Line data Source code
1 : /* SSSE3 vector permutation AES for Libgcrypt
2 : * Copyright (C) 2014-2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
3 : *
4 : * This file is part of Libgcrypt.
5 : *
6 : * Libgcrypt is free software; you can redistribute it and/or modify
7 : * it under the terms of the GNU Lesser General Public License as
8 : * published by the Free Software Foundation; either version 2.1 of
9 : * the License, or (at your option) any later version.
10 : *
11 : * Libgcrypt is distributed in the hope that it will be useful,
12 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : * GNU Lesser General Public License for more details.
15 : *
16 : * You should have received a copy of the GNU Lesser General Public
17 : * License along with this program; if not, see <http://www.gnu.org/licenses/>.
18 : *
19 : *
20 : * The code is based on the public domain library libvpaes version 0.5
21 : * available at http://crypto.stanford.edu/vpaes/ and which carries
22 : * this notice:
23 : *
24 : * libvpaes: constant-time SSSE3 AES encryption and decryption.
25 : * version 0.5
26 : *
27 : * By Mike Hamburg, Stanford University, 2009. Public domain.
28 : * I wrote essentially all of this code. I did not write the test
29 : * vectors; they are the NIST known answer tests. I hereby release all
30 : * the code and documentation here that I wrote into the public domain.
31 : *
32 : * This is an implementation of AES following my paper,
33 : * "Accelerating AES with Vector Permute Instructions"
34 : * CHES 2009; http://shiftleft.org/papers/vector_aes/
35 : */
36 :
37 : #include <config.h>
38 : #include <stdio.h>
39 : #include <stdlib.h>
40 : #include <string.h> /* for memcmp() */
41 :
42 : #include "types.h" /* for byte and u32 typedefs */
43 : #include "g10lib.h"
44 : #include "cipher.h"
45 : #include "bufhelp.h"
46 : #include "cipher-selftest.h"
47 : #include "rijndael-internal.h"
48 : #include "./cipher-internal.h"
49 :
50 :
51 : #ifdef USE_SSSE3
52 :
53 :
54 : #if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
55 : /* Prevent compiler from issuing SSE instructions between asm blocks. */
56 : # pragma GCC target("no-sse")
57 : #endif
58 :
59 :
/* Two macros to be called before and after the use of SSSE3
   instructions.  There should be no external function calls between
   the use of these macros.  Their purpose is to make sure that the
   SSE registers are cleared and won't reveal any information about
   the key or the data.  */
#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
/* Save area for ten 16-byte XMM registers (xmm6..xmm15).  */
# define SSSE3_STATE_SIZE (16 * 10)
/* XMM6-XMM15 are callee-saved registers on WIN64. */
/* Spill the callee-saved registers into ssse3_state (a local byte
   array that must be in scope at the macro expansion site).  */
# define vpaes_ssse3_prepare() \
    asm volatile ("movdqu %%xmm6,  0*16(%0)\n\t" \
                  "movdqu %%xmm7,  1*16(%0)\n\t" \
                  "movdqu %%xmm8,  2*16(%0)\n\t" \
                  "movdqu %%xmm9,  3*16(%0)\n\t" \
                  "movdqu %%xmm10, 4*16(%0)\n\t" \
                  "movdqu %%xmm11, 5*16(%0)\n\t" \
                  "movdqu %%xmm12, 6*16(%0)\n\t" \
                  "movdqu %%xmm13, 7*16(%0)\n\t" \
                  "movdqu %%xmm14, 8*16(%0)\n\t" \
                  "movdqu %%xmm15, 9*16(%0)\n\t" \
                  : \
                  : "r" (ssse3_state) \
                  : "memory" )
/* Clear the scratch registers xmm0..xmm5 (they may hold key or data
   material) and restore the callee-saved xmm6..xmm15.  */
# define vpaes_ssse3_cleanup() \
    asm volatile ("pxor	%%xmm0,  %%xmm0 \n\t" \
                  "pxor	%%xmm1,  %%xmm1 \n\t" \
                  "pxor	%%xmm2,  %%xmm2 \n\t" \
                  "pxor	%%xmm3,  %%xmm3 \n\t" \
                  "pxor	%%xmm4,  %%xmm4 \n\t" \
                  "pxor	%%xmm5,  %%xmm5 \n\t" \
                  "movdqu 0*16(%0), %%xmm6 \n\t" \
                  "movdqu 1*16(%0), %%xmm7 \n\t" \
                  "movdqu 2*16(%0), %%xmm8 \n\t" \
                  "movdqu 3*16(%0), %%xmm9 \n\t" \
                  "movdqu 4*16(%0), %%xmm10 \n\t" \
                  "movdqu 5*16(%0), %%xmm11 \n\t" \
                  "movdqu 6*16(%0), %%xmm12 \n\t" \
                  "movdqu 7*16(%0), %%xmm13 \n\t" \
                  "movdqu 8*16(%0), %%xmm14 \n\t" \
                  "movdqu 9*16(%0), %%xmm15 \n\t" \
                  : \
                  : "r" (ssse3_state) \
                  : "memory" )
#else
/* SysV ABI: all XMM registers are caller-saved, so nothing needs to
   be preserved; the state buffer is a one-byte dummy.  */
# define SSSE3_STATE_SIZE 1
# define vpaes_ssse3_prepare() (void)ssse3_state
/* Clear xmm0..xmm8; xmm9..xmm15 only ever hold public constants
   loaded from .Laes_consts, so they carry no secret material.  */
# define vpaes_ssse3_cleanup() \
    asm volatile ("pxor	%%xmm0,  %%xmm0 \n\t" \
                  "pxor	%%xmm1,  %%xmm1 \n\t" \
                  "pxor	%%xmm2,  %%xmm2 \n\t" \
                  "pxor	%%xmm3,  %%xmm3 \n\t" \
                  "pxor	%%xmm4,  %%xmm4 \n\t" \
                  "pxor	%%xmm5,  %%xmm5 \n\t" \
                  "pxor	%%xmm6,  %%xmm6 \n\t" \
                  "pxor	%%xmm7,  %%xmm7 \n\t" \
                  "pxor	%%xmm8,  %%xmm8 \n\t" \
                  ::: "memory" )
#endif

/* Preload the encryption constants into xmm9..xmm15 and place the
   address of .Laes_consts in CONST_PTR.  The "=c" output pins
   CONST_PTR to %rcx, which is the register in which the asm core
   routines below expect the constant-table base.  */
#define vpaes_ssse3_prepare_enc(const_ptr) \
    vpaes_ssse3_prepare(); \
    asm volatile ("lea .Laes_consts(%%rip), %q0 \n\t" \
                  "movdqa	          (%q0), %%xmm9  # 0F \n\t" \
                  "movdqa	.Lk_inv   (%q0), %%xmm10 # inv \n\t" \
                  "movdqa	.Lk_inv+16(%q0), %%xmm11 # inva \n\t" \
                  "movdqa	.Lk_sb1   (%q0), %%xmm13 # sb1u \n\t" \
                  "movdqa	.Lk_sb1+16(%q0), %%xmm12 # sb1t \n\t" \
                  "movdqa	.Lk_sb2   (%q0), %%xmm15 # sb2u \n\t" \
                  "movdqa	.Lk_sb2+16(%q0), %%xmm14 # sb2t \n\t" \
                  : "=c" (const_ptr) \
                  : \
                  : "memory" )

/* As above, but preload the decryption S-box constants (xmm8 gets the
   extra sbeu table used only by _aes_decrypt_core).  */
#define vpaes_ssse3_prepare_dec(const_ptr) \
    vpaes_ssse3_prepare(); \
    asm volatile ("lea .Laes_consts(%%rip), %q0 \n\t" \
                  "movdqa	           (%q0), %%xmm9  # 0F \n\t" \
                  "movdqa	.Lk_inv    (%q0), %%xmm10 # inv \n\t" \
                  "movdqa	.Lk_inv+16 (%q0), %%xmm11 # inva \n\t" \
                  "movdqa	.Lk_dsb9   (%q0), %%xmm13 # sb9u \n\t" \
                  "movdqa	.Lk_dsb9+16(%q0), %%xmm12 # sb9t \n\t" \
                  "movdqa	.Lk_dsbd   (%q0), %%xmm15 # sbdu \n\t" \
                  "movdqa	.Lk_dsbb   (%q0), %%xmm14 # sbbu \n\t" \
                  "movdqa	.Lk_dsbe   (%q0), %%xmm8  # sbeu \n\t" \
                  : "=c" (const_ptr) \
                  : \
                  : "memory" )
147 :
148 :
/* Expand KEY into ctx->keyschenc32 by calling the asm key-schedule
   routine _aes_schedule_core (defined in the asm block below).  The
   raw key is additionally stashed in ctx->keyschdec32 so that
   _gcry_aes_ssse3_prepare_decryption can later derive the decryption
   schedule from it.  */
void
_gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key)
{
  /* ctx->rounds is 10/12/14 for AES-128/192/256; recover the key
     size in bits from it.  */
  unsigned int keybits = (ctx->rounds - 10) * 32 + 128;
  byte ssse3_state[SSSE3_STATE_SIZE];

  vpaes_ssse3_prepare();

  /* _aes_schedule_core register contract (see asm below):
     rdi = key, esi = key size in bits, rdx = output buffer,
     ecx = direction (0 = encrypt), r8d = initial .Lk_sr rotation.  */
  asm volatile ("leaq %q[key], %%rdi" "\n\t"
                "movl %[bits], %%esi" "\n\t"
                "leaq %[buf], %%rdx" "\n\t"
                "movl %[dir], %%ecx" "\n\t"
                "movl %[rotoffs], %%r8d" "\n\t"
                "call _aes_schedule_core" "\n\t"
                :
                : [key] "m" (*key),
                  [bits] "g" (keybits),
                  [buf] "m" (ctx->keyschenc32[0][0]),
                  [dir] "g" (0),
                  [rotoffs] "g" (48)
                : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi",
                  "cc", "memory");

  vpaes_ssse3_cleanup();

  /* Save key for setting up decryption. */
  memcpy(&ctx->keyschdec32[0][0], key, keybits / 8);
}
177 :
178 :
/* Make a decryption key from an encryption key.  Re-runs
   _aes_schedule_core in decrypt mode (dir = 1) over the raw key that
   _gcry_aes_ssse3_do_setkey saved at the start of keyschdec32; the
   schedule is written backwards, starting at round ctx->rounds.  */
void
_gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx)
{
  /* Same rounds -> keybits mapping as in do_setkey.  */
  unsigned int keybits = (ctx->rounds - 10) * 32 + 128;
  byte ssse3_state[SSSE3_STATE_SIZE];

  vpaes_ssse3_prepare();

  asm volatile ("leaq %q[key], %%rdi" "\n\t"
                "movl %[bits], %%esi" "\n\t"
                "leaq %[buf], %%rdx" "\n\t"
                "movl %[dir], %%ecx" "\n\t"
                "movl %[rotoffs], %%r8d" "\n\t"
                "call _aes_schedule_core" "\n\t"
                :
                : [key] "m" (ctx->keyschdec32[0][0]),
                  [bits] "g" (keybits),
                  [buf] "m" (ctx->keyschdec32[ctx->rounds][0]),
                  [dir] "g" (1),
                  /* 192-bit keys start with rotation offset 0, the
                     others with 32 (consumed via r8 by the core).  */
                  [rotoffs] "g" ((keybits == 192) ? 0 : 32)
                : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi",
                  "cc", "memory");

  vpaes_ssse3_cleanup();
}
205 :
206 :
/* Encrypt one block using the Intel SSSE3 instructions.  Block is input
 * and output through SSE register xmm0. */
static inline void
do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds,
                    const void *aes_const_ptr)
{
  /* _aes_encrypt_core expects nrounds-1 in %rax and the key schedule
     pointer in %rdx, and modifies both — hence the "+" read-write
     constraints.  AES_CONST_PTR must be the .Laes_consts base address
     obtained from vpaes_ssse3_prepare_enc(); it is passed in %rcx.  */
  unsigned int middle_rounds = nrounds - 1;
  const void *keysched = ctx->keyschenc32;

  asm volatile ("call _aes_encrypt_core" "\n\t"
                : "+a" (middle_rounds), "+d" (keysched)
                : "c" (aes_const_ptr)
                : "rdi", "rsi", "cc", "memory");
}
221 :
222 :
/* Decrypt one block using the Intel SSSE3 instructions.  Block is input
 * and output through SSE register xmm0. */
static inline void
do_vpaes_ssse3_dec (const RIJNDAEL_context *ctx, unsigned int nrounds,
                    const void *aes_const_ptr)
{
  /* Same register contract as do_vpaes_ssse3_enc, but using the
     decryption schedule and _aes_decrypt_core (which, unlike the
     encrypt core, does not clobber %rdi).  AES_CONST_PTR must come
     from vpaes_ssse3_prepare_dec().  */
  unsigned int middle_rounds = nrounds - 1;
  const void *keysched = ctx->keyschdec32;

  asm volatile ("call _aes_decrypt_core" "\n\t"
                : "+a" (middle_rounds), "+d" (keysched)
                : "c" (aes_const_ptr)
                : "rsi", "cc", "memory");
}
237 :
238 :
/* Encrypt the single 16-byte block SRC into DST (ECB primitive).
   Returns 0 — NOTE(review): callers appear to use this return value
   as a stack-burn depth; confirm against rijndael.c.  */
unsigned int
_gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
                         const unsigned char *src)
{
  unsigned int nrounds = ctx->rounds;
  const void *aes_const_ptr;
  byte ssse3_state[SSSE3_STATE_SIZE];

  vpaes_ssse3_prepare_enc (aes_const_ptr);
  /* Load the plaintext into xmm0, the core's in/out register.  */
  asm volatile ("movdqu %[src], %%xmm0\n\t"
                :
                : [src] "m" (*src)
                : "memory" );
  do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
  asm volatile ("movdqu %%xmm0, %[dst]\n\t"
                : [dst] "=m" (*dst)
                :
                : "memory" );
  vpaes_ssse3_cleanup ();
  return 0;
}
260 :
261 :
/* CFB-mode encryption of NBLOCKS blocks from INBUF to OUTBUF.
   C_i = E(C_{i-1}) ^ P_i, with xmm0 carrying the running IV/cipher
   block across iterations.  IV is updated in place.  */
void
_gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
                         const unsigned char *inbuf, unsigned char *iv,
                         size_t nblocks)
{
  unsigned int nrounds = ctx->rounds;
  const void *aes_const_ptr;
  byte ssse3_state[SSSE3_STATE_SIZE];

  vpaes_ssse3_prepare_enc (aes_const_ptr);

  /* Seed xmm0 with the IV; it stays live across the whole loop.  */
  asm volatile ("movdqu %[iv], %%xmm0\n\t"
                : /* No output */
                : [iv] "m" (*iv)
                : "memory" );

  for ( ;nblocks; nblocks-- )
    {
      /* xmm0 := E(xmm0) */
      do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);

      /* xmm0 ^= plaintext; store ciphertext (which is also the next
         block's feedback value).  */
      asm volatile ("movdqu %[inbuf], %%xmm1\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    : [inbuf] "m" (*inbuf)
                    : "memory" );

      outbuf += BLOCKSIZE;
      inbuf += BLOCKSIZE;
    }

  /* Write back the final feedback block as the new IV.  */
  asm volatile ("movdqu %%xmm0, %[iv]\n\t"
                : [iv] "=m" (*iv)
                :
                : "memory" );

  vpaes_ssse3_cleanup ();
}
300 :
301 :
/* CBC-mode encryption of NBLOCKS blocks from INBUF to OUTBUF.
   C_i = E(P_i ^ C_{i-1}); xmm7 holds the running IV.  When CBC_MAC is
   set, OUTBUF is not advanced, so only the final block (the MAC) is
   kept.  IV is updated in place.  */
void
_gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
                         const unsigned char *inbuf, unsigned char *iv,
                         size_t nblocks, int cbc_mac)
{
  unsigned int nrounds = ctx->rounds;
  const void *aes_const_ptr;
  byte ssse3_state[SSSE3_STATE_SIZE];

  vpaes_ssse3_prepare_enc (aes_const_ptr);

  /* Keep the IV in xmm7 across iterations (preserved by the core).  */
  asm volatile ("movdqu %[iv], %%xmm7\n\t"
                : /* No output */
                : [iv] "m" (*iv)
                : "memory" );

  for ( ;nblocks; nblocks-- )
    {
      /* xmm0 := P_i ^ IV */
      asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
                    "pxor %%xmm7, %%xmm0\n\t"
                    : /* No output */
                    : [inbuf] "m" (*inbuf)
                    : "memory" );

      do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);

      /* Ciphertext becomes the next IV.  */
      asm volatile ("movdqa %%xmm0, %%xmm7\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    :
                    : "memory" );

      inbuf += BLOCKSIZE;
      if (!cbc_mac)
        outbuf += BLOCKSIZE;
    }

  asm volatile ("movdqu %%xmm7, %[iv]\n\t"
                : [iv] "=m" (*iv)
                :
                : "memory" );

  vpaes_ssse3_cleanup ();
}
346 :
347 :
348 : void
349 0 : _gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
350 : const unsigned char *inbuf, unsigned char *ctr,
351 : size_t nblocks)
352 : {
353 : static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
354 : { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
355 0 : unsigned int nrounds = ctx->rounds;
356 : const void *aes_const_ptr;
357 : byte ssse3_state[SSSE3_STATE_SIZE];
358 : u64 ctrlow;
359 :
360 0 : vpaes_ssse3_prepare_enc (aes_const_ptr);
361 :
362 0 : asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
363 : "movdqa (%[ctr]), %%xmm7\n\t" /* Preload CTR */
364 : "movq 8(%[ctr]), %q[ctrlow]\n\t"
365 : "bswapq %q[ctrlow]\n\t"
366 : : [ctrlow] "=r" (ctrlow)
367 : : [mask] "m" (*be_mask),
368 : [ctr] "r" (ctr)
369 : : "memory", "cc");
370 :
371 0 : for ( ;nblocks; nblocks-- )
372 : {
373 0 : asm volatile ("movdqa %%xmm7, %%xmm0\n\t" /* xmm0 := CTR (xmm7) */
374 : "pcmpeqd %%xmm1, %%xmm1\n\t"
375 : "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */
376 :
377 : "pshufb %%xmm6, %%xmm7\n\t"
378 : "psubq %%xmm1, %%xmm7\n\t" /* xmm7++ (big endian) */
379 :
380 : /* detect if 64-bit carry handling is needed */
381 : "incq %q[ctrlow]\n\t"
382 : "jnz .Lno_carry%=\n\t"
383 :
384 : "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */
385 : "psubq %%xmm1, %%xmm7\n\t" /* add carry to upper 64bits */
386 :
387 : ".Lno_carry%=:\n\t"
388 :
389 : "pshufb %%xmm6, %%xmm7\n\t"
390 : :
391 : : [ctr] "r" (ctr), [ctrlow] "r" (ctrlow)
392 : : "cc", "memory");
393 :
394 0 : do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
395 :
396 0 : asm volatile ("movdqu %[src], %%xmm1\n\t" /* xmm1 := input */
397 : "pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */
398 : "movdqu %%xmm0, %[dst]" /* Store EncCTR. */
399 : : [dst] "=m" (*outbuf)
400 : : [src] "m" (*inbuf)
401 : : "memory");
402 :
403 0 : outbuf += BLOCKSIZE;
404 0 : inbuf += BLOCKSIZE;
405 : }
406 :
407 0 : asm volatile ("movdqu %%xmm7, %[ctr]\n\t" /* Update CTR (mem). */
408 : : [ctr] "=m" (*ctr)
409 : :
410 : : "memory" );
411 :
412 0 : vpaes_ssse3_cleanup ();
413 0 : }
414 :
415 :
/* Decrypt the single 16-byte block SRC into DST (ECB primitive).
   Returns 0 — NOTE(review): callers appear to use this return value
   as a stack-burn depth; confirm against rijndael.c.  */
unsigned int
_gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
                         const unsigned char *src)
{
  unsigned int nrounds = ctx->rounds;
  const void *aes_const_ptr;
  byte ssse3_state[SSSE3_STATE_SIZE];

  vpaes_ssse3_prepare_dec (aes_const_ptr);
  /* Load the ciphertext into xmm0, the core's in/out register.  */
  asm volatile ("movdqu %[src], %%xmm0\n\t"
                :
                : [src] "m" (*src)
                : "memory" );
  do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr);
  asm volatile ("movdqu %%xmm0, %[dst]\n\t"
                : [dst] "=m" (*dst)
                :
                : "memory" );
  vpaes_ssse3_cleanup ();
  return 0;
}
437 :
438 :
/* CFB-mode decryption of NBLOCKS blocks from INBUF to OUTBUF.
   P_i = E(C_{i-1}) ^ C_i — note CFB decryption also uses the
   *encryption* core, which is why this sets up with prepare_enc and
   do_vpaes_ssse3_enc.  IV is updated in place.  */
void
_gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
                         const unsigned char *inbuf, unsigned char *iv,
                         size_t nblocks)
{
  unsigned int nrounds = ctx->rounds;
  const void *aes_const_ptr;
  byte ssse3_state[SSSE3_STATE_SIZE];

  vpaes_ssse3_prepare_enc (aes_const_ptr);

  /* Seed xmm0 with the IV; it carries the feedback ciphertext.  */
  asm volatile ("movdqu %[iv], %%xmm0\n\t"
                : /* No output */
                : [iv] "m" (*iv)
                : "memory" );

  for ( ;nblocks; nblocks-- )
    {
      do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);

      /* xmm6 := E(feedback) ^ C_i (plaintext out); xmm0 := C_i, the
         next feedback block.  */
      asm volatile ("movdqa %%xmm0, %%xmm6\n\t"
                    "movdqu %[inbuf], %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm6\n\t"
                    "movdqu %%xmm6, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    : [inbuf] "m" (*inbuf)
                    : "memory" );

      outbuf += BLOCKSIZE;
      inbuf += BLOCKSIZE;
    }

  asm volatile ("movdqu %%xmm0, %[iv]\n\t"
                : [iv] "=m" (*iv)
                :
                : "memory" );

  vpaes_ssse3_cleanup ();
}
478 :
479 :
480 : void
481 0 : _gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
482 : const unsigned char *inbuf, unsigned char *iv,
483 : size_t nblocks)
484 : {
485 0 : unsigned int nrounds = ctx->rounds;
486 : const void *aes_const_ptr;
487 : byte ssse3_state[SSSE3_STATE_SIZE];
488 :
489 0 : vpaes_ssse3_prepare_dec (aes_const_ptr);
490 :
491 0 : asm volatile
492 : ("movdqu %[iv], %%xmm7\n\t" /* use xmm7 as fast IV storage */
493 : : /* No output */
494 : : [iv] "m" (*iv)
495 : : "memory");
496 :
497 0 : for ( ;nblocks; nblocks-- )
498 : {
499 0 : asm volatile
500 : ("movdqu %[inbuf], %%xmm0\n\t"
501 : "movdqa %%xmm0, %%xmm6\n\t" /* use xmm6 as savebuf */
502 : : /* No output */
503 : : [inbuf] "m" (*inbuf)
504 : : "memory");
505 :
506 0 : do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr);
507 :
508 0 : asm volatile
509 : ("pxor %%xmm7, %%xmm0\n\t" /* xor IV with output */
510 : "movdqu %%xmm0, %[outbuf]\n\t"
511 : "movdqu %%xmm6, %%xmm7\n\t" /* store savebuf as new IV */
512 : : [outbuf] "=m" (*outbuf)
513 : :
514 : : "memory");
515 :
516 0 : outbuf += BLOCKSIZE;
517 0 : inbuf += BLOCKSIZE;
518 : }
519 :
520 0 : asm volatile
521 : ("movdqu %%xmm7, %[iv]\n\t" /* store IV */
522 : : /* No output */
523 : : [iv] "m" (*iv)
524 : : "memory");
525 :
526 0 : vpaes_ssse3_cleanup ();
527 0 : }
528 :
529 :
/* OCB-mode bulk encryption worker.  xmm7 holds the running Offset,
   xmm6 the running plaintext Checksum; both are loaded from and
   written back to the cipher handle, as is the block counter N.  */
static void
ssse3_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
               const void *inbuf_arg, size_t nblocks)
{
  RIJNDAEL_context *ctx = (void *)&c->context.c;
  unsigned char *outbuf = outbuf_arg;
  const unsigned char *inbuf = inbuf_arg;
  u64 n = c->u_mode.ocb.data_nblocks;
  unsigned int nrounds = ctx->rounds;
  const void *aes_const_ptr;
  byte ssse3_state[SSSE3_STATE_SIZE];

  vpaes_ssse3_prepare_enc (aes_const_ptr);

  /* Preload Offset and Checksum */
  asm volatile ("movdqu %[iv], %%xmm7\n\t"
                "movdqu %[ctr], %%xmm6\n\t"
                : /* No output */
                : [iv] "m" (*c->u_iv.iv),
                  [ctr] "m" (*c->u_ctr.ctr)
                : "memory" );

  for ( ;nblocks; nblocks-- )
    {
      const unsigned char *l;

      /* L table entry for the number of trailing zeros of the
         (1-based) block index.  */
      l = ocb_get_l(c, ++n);

      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      /* Checksum_i = Checksum_{i-1} xor P_i  */
      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
      asm volatile ("movdqu %[l], %%xmm1\n\t"
                    "movdqu %[inbuf], %%xmm0\n\t"
                    "pxor %%xmm1, %%xmm7\n\t"
                    "pxor %%xmm0, %%xmm6\n\t"
                    "pxor %%xmm7, %%xmm0\n\t"
                    :
                    : [l] "m" (*l),
                      [inbuf] "m" (*inbuf)
                    : "memory" );

      do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);

      asm volatile ("pxor %%xmm7, %%xmm0\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    :
                    : "memory" );

      inbuf += BLOCKSIZE;
      outbuf += BLOCKSIZE;
    }

  /* Persist the updated block count, Offset and Checksum.  */
  c->u_mode.ocb.data_nblocks = n;
  asm volatile ("movdqu %%xmm7, %[iv]\n\t"
                "movdqu %%xmm6, %[ctr]\n\t"
                : [iv] "=m" (*c->u_iv.iv),
                  [ctr] "=m" (*c->u_ctr.ctr)
                :
                : "memory" );

  vpaes_ssse3_cleanup ();
}
593 :
/* OCB-mode bulk decryption worker.  Mirror image of ssse3_ocb_enc:
   xmm7 = Offset, xmm6 = Checksum, except the Checksum is updated with
   the recovered plaintext *after* decryption.  */
static void
ssse3_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
               const void *inbuf_arg, size_t nblocks)
{
  RIJNDAEL_context *ctx = (void *)&c->context.c;
  unsigned char *outbuf = outbuf_arg;
  const unsigned char *inbuf = inbuf_arg;
  u64 n = c->u_mode.ocb.data_nblocks;
  unsigned int nrounds = ctx->rounds;
  const void *aes_const_ptr;
  byte ssse3_state[SSSE3_STATE_SIZE];

  vpaes_ssse3_prepare_dec (aes_const_ptr);

  /* Preload Offset and Checksum */
  asm volatile ("movdqu %[iv], %%xmm7\n\t"
                "movdqu %[ctr], %%xmm6\n\t"
                : /* No output */
                : [iv] "m" (*c->u_iv.iv),
                  [ctr] "m" (*c->u_ctr.ctr)
                : "memory" );

  for ( ;nblocks; nblocks-- )
    {
      const unsigned char *l;

      l = ocb_get_l(c, ++n);

      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
      /* Checksum_i = Checksum_{i-1} xor P_i  */
      asm volatile ("movdqu %[l], %%xmm1\n\t"
                    "movdqu %[inbuf], %%xmm0\n\t"
                    "pxor %%xmm1, %%xmm7\n\t"
                    "pxor %%xmm7, %%xmm0\n\t"
                    :
                    : [l] "m" (*l),
                      [inbuf] "m" (*inbuf)
                    : "memory" );

      do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr);

      /* Unmask to plaintext, fold it into the checksum, store it.  */
      asm volatile ("pxor %%xmm7, %%xmm0\n\t"
                    "pxor %%xmm0, %%xmm6\n\t"
                    "movdqu %%xmm0, %[outbuf]\n\t"
                    : [outbuf] "=m" (*outbuf)
                    :
                    : "memory" );

      inbuf += BLOCKSIZE;
      outbuf += BLOCKSIZE;
    }

  /* Persist the updated block count, Offset and Checksum.  */
  c->u_mode.ocb.data_nblocks = n;
  asm volatile ("movdqu %%xmm7, %[iv]\n\t"
                "movdqu %%xmm6, %[ctr]\n\t"
                : [iv] "=m" (*c->u_iv.iv),
                  [ctr] "=m" (*c->u_ctr.ctr)
                :
                : "memory" );

  vpaes_ssse3_cleanup ();
}
657 :
658 :
659 : void
660 0 : _gcry_aes_ssse3_ocb_crypt(gcry_cipher_hd_t c, void *outbuf_arg,
661 : const void *inbuf_arg, size_t nblocks, int encrypt)
662 : {
663 0 : if (encrypt)
664 0 : ssse3_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
665 : else
666 0 : ssse3_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
667 0 : }
668 :
669 :
/* OCB authentication (AAD) processing.  Same structure as the data
   path, but over the AAD offset/sum state: xmm7 = aad Offset,
   xmm6 = aad Sum, and no ciphertext is produced.  */
void
_gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
                          size_t nblocks)
{
  RIJNDAEL_context *ctx = (void *)&c->context.c;
  const unsigned char *abuf = abuf_arg;
  u64 n = c->u_mode.ocb.aad_nblocks;
  unsigned int nrounds = ctx->rounds;
  const void *aes_const_ptr;
  byte ssse3_state[SSSE3_STATE_SIZE];

  vpaes_ssse3_prepare_enc (aes_const_ptr);

  /* Preload Offset and Sum */
  asm volatile ("movdqu %[iv], %%xmm7\n\t"
                "movdqu %[ctr], %%xmm6\n\t"
                : /* No output */
                : [iv] "m" (*c->u_mode.ocb.aad_offset),
                  [ctr] "m" (*c->u_mode.ocb.aad_sum)
                : "memory" );

  for ( ;nblocks; nblocks-- )
    {
      const unsigned char *l;

      l = ocb_get_l(c, ++n);

      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
      asm volatile ("movdqu %[l], %%xmm1\n\t"
                    "movdqu %[abuf], %%xmm0\n\t"
                    "pxor %%xmm1, %%xmm7\n\t"
                    "pxor %%xmm7, %%xmm0\n\t"
                    :
                    : [l] "m" (*l),
                      [abuf] "m" (*abuf)
                    : "memory" );

      do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);

      /* Fold the encrypted block into the running Sum.  */
      asm volatile ("pxor %%xmm0, %%xmm6\n\t"
                    :
                    :
                    : "memory" );

      abuf += BLOCKSIZE;
    }

  /* Persist the updated block count, Offset and Sum.  */
  c->u_mode.ocb.aad_nblocks = n;
  asm volatile ("movdqu %%xmm7, %[iv]\n\t"
                "movdqu %%xmm6, %[ctr]\n\t"
                : [iv] "=m" (*c->u_mode.ocb.aad_offset),
                  [ctr] "=m" (*c->u_mode.ocb.aad_sum)
                :
                : "memory" );

  vpaes_ssse3_cleanup ();
}
728 :
729 :
#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
/* X() wraps ELF-only assembler directives (.type/.size) in the asm
   block below; on WIN64 assemblers they are expanded to nothing.  */
# define X(...)
#else
# define X(...) __VA_ARGS__
#endif
735 :
736 : asm (
737 : "\n\t" "##"
738 : "\n\t" "## Constant-time SSSE3 AES core implementation."
739 : "\n\t" "##"
740 : "\n\t" "## By Mike Hamburg (Stanford University), 2009"
741 : "\n\t" "## Public domain."
742 : "\n\t" "##"
743 :
744 : "\n\t" ".text"
745 :
746 : "\n\t" "##"
747 : "\n\t" "## _aes_encrypt_core"
748 : "\n\t" "##"
749 : "\n\t" "## AES-encrypt %xmm0."
750 : "\n\t" "##"
751 : "\n\t" "## Inputs:"
752 : "\n\t" "## %xmm0 = input"
753 : "\n\t" "## %xmm9-%xmm15 as in .Laes_preheat"
754 : "\n\t" "## %rcx = .Laes_consts"
755 : "\n\t" "## (%rdx) = scheduled keys"
756 : "\n\t" "## %rax = nrounds - 1"
757 : "\n\t" "##"
758 : "\n\t" "## Output in %xmm0"
759 : "\n\t" "## Clobbers %xmm1-%xmm4, %r9, %r11, %rax"
760 : "\n\t" "## Preserves %xmm6 - %xmm7 so you get some local vectors"
761 : "\n\t" "##"
762 : "\n\t" "##"
763 : "\n\t" ".align 16"
764 : X("\n\t" ".type _aes_encrypt_core,@function")
765 : "\n\t" "_aes_encrypt_core:"
766 : "\n\t" " leaq .Lk_mc_backward(%rcx), %rdi"
767 : "\n\t" " mov $16, %rsi"
768 : "\n\t" " movdqa .Lk_ipt (%rcx), %xmm2 # iptlo"
769 : "\n\t" " movdqa %xmm9, %xmm1"
770 : "\n\t" " pandn %xmm0, %xmm1"
771 : "\n\t" " psrld $4, %xmm1"
772 : "\n\t" " pand %xmm9, %xmm0"
773 : "\n\t" " pshufb %xmm0, %xmm2"
774 : "\n\t" " movdqa .Lk_ipt+16(%rcx), %xmm0 # ipthi"
775 : "\n\t" " pshufb %xmm1, %xmm0"
776 : "\n\t" " pxor (%rdx),%xmm2"
777 : "\n\t" " pxor %xmm2, %xmm0"
778 : "\n\t" " add $16, %rdx"
779 : "\n\t" " jmp .Laes_entry"
780 :
781 : "\n\t" ".align 8"
782 : "\n\t" ".Laes_loop:"
783 : "\n\t" " # middle of middle round"
784 : "\n\t" " movdqa %xmm13, %xmm4 # 4 : sb1u"
785 : "\n\t" " pshufb %xmm2, %xmm4 # 4 = sb1u"
786 : "\n\t" " pxor (%rdx), %xmm4 # 4 = sb1u + k"
787 : "\n\t" " movdqa %xmm12, %xmm0 # 0 : sb1t"
788 : "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
789 : "\n\t" " pxor %xmm4, %xmm0 # 0 = A"
790 : "\n\t" " movdqa %xmm15, %xmm4 # 4 : sb2u"
791 : "\n\t" " pshufb %xmm2, %xmm4 # 4 = sb2u"
792 : "\n\t" " movdqa .Lk_mc_forward-.Lk_mc_backward(%rsi,%rdi), %xmm1"
793 : "\n\t" " movdqa %xmm14, %xmm2 # 2 : sb2t"
794 : "\n\t" " pshufb %xmm3, %xmm2 # 2 = sb2t"
795 : "\n\t" " pxor %xmm4, %xmm2 # 2 = 2A"
796 : "\n\t" " movdqa %xmm0, %xmm3 # 3 = A"
797 : "\n\t" " pshufb %xmm1, %xmm0 # 0 = B"
798 : "\n\t" " pxor %xmm2, %xmm0 # 0 = 2A+B"
799 : "\n\t" " pshufb (%rsi,%rdi), %xmm3 # 3 = D"
800 : "\n\t" " lea 16(%esi),%esi # next mc"
801 : "\n\t" " pxor %xmm0, %xmm3 # 3 = 2A+B+D"
802 : "\n\t" " lea 16(%rdx),%rdx # next key"
803 : "\n\t" " pshufb %xmm1, %xmm0 # 0 = 2B+C"
804 : "\n\t" " pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D"
805 : "\n\t" " and $48, %rsi # ... mod 4"
806 : "\n\t" " dec %rax # nr--"
807 :
808 : "\n\t" ".Laes_entry:"
809 : "\n\t" " # top of round"
810 : "\n\t" " movdqa %xmm9, %xmm1 # 1 : i"
811 : "\n\t" " pandn %xmm0, %xmm1 # 1 = i<<4"
812 : "\n\t" " psrld $4, %xmm1 # 1 = i"
813 : "\n\t" " pand %xmm9, %xmm0 # 0 = k"
814 : "\n\t" " movdqa %xmm11, %xmm2 # 2 : a/k"
815 : "\n\t" " pshufb %xmm0, %xmm2 # 2 = a/k"
816 : "\n\t" " pxor %xmm1, %xmm0 # 0 = j"
817 : "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/i"
818 : "\n\t" " pshufb %xmm1, %xmm3 # 3 = 1/i"
819 : "\n\t" " pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k"
820 : "\n\t" " movdqa %xmm10, %xmm4 # 4 : 1/j"
821 : "\n\t" " pshufb %xmm0, %xmm4 # 4 = 1/j"
822 : "\n\t" " pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k"
823 : "\n\t" " movdqa %xmm10, %xmm2 # 2 : 1/iak"
824 : "\n\t" " pshufb %xmm3, %xmm2 # 2 = 1/iak"
825 : "\n\t" " pxor %xmm0, %xmm2 # 2 = io"
826 : "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/jak"
827 : "\n\t" " pshufb %xmm4, %xmm3 # 3 = 1/jak"
828 : "\n\t" " pxor %xmm1, %xmm3 # 3 = jo"
829 : "\n\t" " jnz .Laes_loop"
830 :
831 : "\n\t" " # middle of last round"
832 : "\n\t" " movdqa .Lk_sbo(%rcx), %xmm4 # 3 : sbou"
833 : "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbou"
834 : "\n\t" " pxor (%rdx), %xmm4 # 4 = sb1u + k"
835 : "\n\t" " movdqa .Lk_sbo+16(%rcx), %xmm0 # 0 : sbot"
836 : "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
837 : "\n\t" " pxor %xmm4, %xmm0 # 0 = A"
838 : "\n\t" " pshufb .Lk_sr(%rsi,%rcx), %xmm0"
839 : "\n\t" " ret"
840 : X("\n\t" ".size _aes_encrypt_core,.-_aes_encrypt_core")
841 :
842 : "\n\t" "##"
843 : "\n\t" "## Decryption core"
844 : "\n\t" "##"
845 : "\n\t" "## Same API as encryption core."
846 : "\n\t" "##"
847 : "\n\t" ".align 16"
848 : X("\n\t" ".type _aes_decrypt_core,@function")
849 : "\n\t" "_aes_decrypt_core:"
850 : "\n\t" " movl %eax, %esi"
851 : "\n\t" " shll $4, %esi"
852 : "\n\t" " xorl $48, %esi"
853 : "\n\t" " andl $48, %esi"
854 : "\n\t" " movdqa .Lk_dipt (%rcx), %xmm2 # iptlo"
855 : "\n\t" " movdqa %xmm9, %xmm1"
856 : "\n\t" " pandn %xmm0, %xmm1"
857 : "\n\t" " psrld $4, %xmm1"
858 : "\n\t" " pand %xmm9, %xmm0"
859 : "\n\t" " pshufb %xmm0, %xmm2"
860 : "\n\t" " movdqa .Lk_dipt+16(%rcx), %xmm0 # ipthi"
861 : "\n\t" " pshufb %xmm1, %xmm0"
862 : "\n\t" " pxor (%rdx), %xmm2"
863 : "\n\t" " pxor %xmm2, %xmm0"
864 : "\n\t" " movdqa .Lk_mc_forward+48(%rcx), %xmm5"
865 : "\n\t" " lea 16(%rdx), %rdx"
866 : "\n\t" " neg %rax"
867 : "\n\t" " jmp .Laes_dec_entry"
868 :
869 : "\n\t" ".align 16"
870 : "\n\t" ".Laes_dec_loop:"
871 : "\n\t" "##"
872 : "\n\t" "## Inverse mix columns"
873 : "\n\t" "##"
874 : "\n\t" " movdqa %xmm13, %xmm4 # 4 : sb9u"
875 : "\n\t" " pshufb %xmm2, %xmm4 # 4 = sb9u"
876 : "\n\t" " pxor (%rdx), %xmm4"
877 : "\n\t" " movdqa %xmm12, %xmm0 # 0 : sb9t"
878 : "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb9t"
879 : "\n\t" " movdqa .Lk_dsbd+16(%rcx),%xmm1 # 1 : sbdt"
880 : "\n\t" " pxor %xmm4, %xmm0 # 0 = ch"
881 : "\n\t" " lea 16(%rdx), %rdx # next round key"
882 :
883 : "\n\t" " pshufb %xmm5, %xmm0 # MC ch"
884 : "\n\t" " movdqa %xmm15, %xmm4 # 4 : sbdu"
885 : "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbdu"
886 : "\n\t" " pxor %xmm0, %xmm4 # 4 = ch"
887 : "\n\t" " pshufb %xmm3, %xmm1 # 1 = sbdt"
888 : "\n\t" " pxor %xmm4, %xmm1 # 1 = ch"
889 :
890 : "\n\t" " pshufb %xmm5, %xmm1 # MC ch"
891 : "\n\t" " movdqa %xmm14, %xmm4 # 4 : sbbu"
892 : "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbbu"
893 : "\n\t" " inc %rax # nr--"
894 : "\n\t" " pxor %xmm1, %xmm4 # 4 = ch"
895 : "\n\t" " movdqa .Lk_dsbb+16(%rcx),%xmm0 # 0 : sbbt"
896 : "\n\t" " pshufb %xmm3, %xmm0 # 0 = sbbt"
897 : "\n\t" " pxor %xmm4, %xmm0 # 0 = ch"
898 :
899 : "\n\t" " pshufb %xmm5, %xmm0 # MC ch"
900 : "\n\t" " movdqa %xmm8, %xmm4 # 4 : sbeu"
901 : "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbeu"
902 : "\n\t" " pshufd $0x93, %xmm5, %xmm5"
903 : "\n\t" " pxor %xmm0, %xmm4 # 4 = ch"
904 : "\n\t" " movdqa .Lk_dsbe+16(%rcx),%xmm0 # 0 : sbet"
905 : "\n\t" " pshufb %xmm3, %xmm0 # 0 = sbet"
906 : "\n\t" " pxor %xmm4, %xmm0 # 0 = ch"
907 :
908 : "\n\t" ".Laes_dec_entry:"
909 : "\n\t" " # top of round"
910 : "\n\t" " movdqa %xmm9, %xmm1 # 1 : i"
911 : "\n\t" " pandn %xmm0, %xmm1 # 1 = i<<4"
912 : "\n\t" " psrld $4, %xmm1 # 1 = i"
913 : "\n\t" " pand %xmm9, %xmm0 # 0 = k"
914 : "\n\t" " movdqa %xmm11, %xmm2 # 2 : a/k"
915 : "\n\t" " pshufb %xmm0, %xmm2 # 2 = a/k"
916 : "\n\t" " pxor %xmm1, %xmm0 # 0 = j"
917 : "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/i"
918 : "\n\t" " pshufb %xmm1, %xmm3 # 3 = 1/i"
919 : "\n\t" " pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k"
920 : "\n\t" " movdqa %xmm10, %xmm4 # 4 : 1/j"
921 : "\n\t" " pshufb %xmm0, %xmm4 # 4 = 1/j"
922 : "\n\t" " pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k"
923 : "\n\t" " movdqa %xmm10, %xmm2 # 2 : 1/iak"
924 : "\n\t" " pshufb %xmm3, %xmm2 # 2 = 1/iak"
925 : "\n\t" " pxor %xmm0, %xmm2 # 2 = io"
926 : "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/jak"
927 : "\n\t" " pshufb %xmm4, %xmm3 # 3 = 1/jak"
928 : "\n\t" " pxor %xmm1, %xmm3 # 3 = jo"
929 : "\n\t" " jnz .Laes_dec_loop"
930 :
931 : "\n\t" " # middle of last round"
932 : "\n\t" " movdqa .Lk_dsbo(%rcx), %xmm4 # 3 : sbou"
933 : "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbou"
934 : "\n\t" " pxor (%rdx), %xmm4 # 4 = sb1u + k"
935 : "\n\t" " movdqa .Lk_dsbo+16(%rcx), %xmm0 # 0 : sbot"
936 : "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
937 : "\n\t" " pxor %xmm4, %xmm0 # 0 = A"
938 : "\n\t" " pshufb .Lk_sr(%rsi,%rcx), %xmm0"
939 : "\n\t" " ret"
940 : X("\n\t" ".size _aes_decrypt_core,.-_aes_decrypt_core")
941 :
942 : "\n\t" "########################################################"
943 : "\n\t" "## ##"
944 : "\n\t" "## AES key schedule ##"
945 : "\n\t" "## ##"
946 : "\n\t" "########################################################"
947 :
948 : "\n\t" ".align 16"
949 : X("\n\t" ".type _aes_schedule_core,@function")
950 : "\n\t" "_aes_schedule_core:"
951 : "\n\t" " # rdi = key"
952 : "\n\t" " # rsi = size in bits"
953 : "\n\t" " # rdx = buffer"
954 : "\n\t" " # rcx = direction. 0=encrypt, 1=decrypt"
955 :
956 : "\n\t" " # load the tables"
957 : "\n\t" " lea .Laes_consts(%rip), %r10"
958 : "\n\t" " movdqa (%r10), %xmm9 # 0F"
959 : "\n\t" " movdqa .Lk_inv (%r10), %xmm10 # inv"
960 : "\n\t" " movdqa .Lk_inv+16(%r10), %xmm11 # inva"
961 : "\n\t" " movdqa .Lk_sb1 (%r10), %xmm13 # sb1u"
962 : "\n\t" " movdqa .Lk_sb1+16(%r10), %xmm12 # sb1t"
963 : "\n\t" " movdqa .Lk_sb2 (%r10), %xmm15 # sb2u"
964 : "\n\t" " movdqa .Lk_sb2+16(%r10), %xmm14 # sb2t"
965 :
966 : "\n\t" " movdqa .Lk_rcon(%r10), %xmm8 # load rcon"
967 : "\n\t" " movdqu (%rdi), %xmm0 # load key (unaligned)"
968 :
969 : "\n\t" " # input transform"
970 : "\n\t" " movdqu %xmm0, %xmm3"
971 : "\n\t" " lea .Lk_ipt(%r10), %r11"
972 : "\n\t" " call .Laes_schedule_transform"
973 : "\n\t" " movdqu %xmm0, %xmm7"
974 :
975 : "\n\t" " test %rcx, %rcx"
976 : "\n\t" " jnz .Laes_schedule_am_decrypting"
977 :
978 : "\n\t" " # encrypting, output zeroth round key after transform"
979 : "\n\t" " movdqa %xmm0, (%rdx)"
980 : "\n\t" " jmp .Laes_schedule_go"
981 :
982 : "\n\t" ".Laes_schedule_am_decrypting:"
983 : "\n\t" " # decrypting, output zeroth round key after shiftrows"
984 : "\n\t" " pshufb .Lk_sr(%r8,%r10),%xmm3"
985 : "\n\t" " movdqa %xmm3, (%rdx)"
986 : "\n\t" " xor $48, %r8"
987 :
988 : "\n\t" ".Laes_schedule_go:"
989 : "\n\t" " cmp $192, %rsi"
990 : "\n\t" " je .Laes_schedule_192"
991 : "\n\t" " cmp $256, %rsi"
992 : "\n\t" " je .Laes_schedule_256"
993 : "\n\t" " # 128: fall through"
994 :
995 : "\n\t" "##"
996 : "\n\t" "## .Laes_schedule_128"
997 : "\n\t" "##"
998 : "\n\t" "## 128-bit specific part of key schedule."
999 : "\n\t" "##"
1000 : "\n\t" "## This schedule is really simple, because all its parts"
1001 : "\n\t" "## are accomplished by the subroutines."
1002 : "\n\t" "##"
1003 : "\n\t" ".Laes_schedule_128:"
1004 : "\n\t" " mov $10, %rsi"
1005 :
1006 : "\n\t" ".Laes_schedule_128_L:"
1007 : "\n\t" " call .Laes_schedule_round"
1008 : "\n\t" " dec %rsi"
1009 : "\n\t" " jz .Laes_schedule_mangle_last"
1010 : "\n\t" " call .Laes_schedule_mangle # write output"
1011 : "\n\t" " jmp .Laes_schedule_128_L"
1012 :
1013 : "\n\t" "##"
1014 : "\n\t" "## .Laes_schedule_192"
1015 : "\n\t" "##"
1016 : "\n\t" "## 192-bit specific part of key schedule."
1017 : "\n\t" "##"
1018 : "\n\t" "## The main body of this schedule is the same as the 128-bit"
1019 : "\n\t" "## schedule, but with more smearing. The long, high side is"
1020 : "\n\t" "## stored in %xmm7 as before, and the short, low side is in"
1021 : "\n\t" "## the high bits of %xmm6."
1022 : "\n\t" "##"
1023 : "\n\t" "## This schedule is somewhat nastier, however, because each"
1024 : "\n\t" "## round produces 192 bits of key material, or 1.5 round keys."
1025 : "\n\t" "## Therefore, on each cycle we do 2 rounds and produce 3 round"
1026 : "\n\t" "## keys."
1027 : "\n\t" "##"
1028 : "\n\t" ".Laes_schedule_192:"
1029 : "\n\t" " movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)"
1030 : "\n\t" " call .Laes_schedule_transform # input transform"
1031 : "\n\t" " pshufd $0x0E, %xmm0, %xmm6"
1032 : "\n\t" " pslldq $8, %xmm6 # clobber low side with zeros"
1033 : "\n\t" " mov $4, %rsi"
1034 :
1035 : "\n\t" ".Laes_schedule_192_L:"
1036 : "\n\t" " call .Laes_schedule_round"
1037 : "\n\t" " palignr $8,%xmm6,%xmm0 "
1038 : "\n\t" " call .Laes_schedule_mangle # save key n"
1039 : "\n\t" " call .Laes_schedule_192_smear"
1040 : "\n\t" " call .Laes_schedule_mangle # save key n+1"
1041 : "\n\t" " call .Laes_schedule_round"
1042 : "\n\t" " dec %rsi"
1043 : "\n\t" " jz .Laes_schedule_mangle_last"
1044 : "\n\t" " call .Laes_schedule_mangle # save key n+2"
1045 : "\n\t" " call .Laes_schedule_192_smear"
1046 : "\n\t" " jmp .Laes_schedule_192_L"
1047 :
1048 : "\n\t" "##"
1049 : "\n\t" "## .Laes_schedule_192_smear"
1050 : "\n\t" "##"
1051 : "\n\t" "## Smear the short, low side in the 192-bit key schedule."
1052 : "\n\t" "##"
1053 : "\n\t" "## Inputs:"
1054 : "\n\t" "## %xmm7: high side, b a x y"
1055 : "\n\t" "## %xmm6: low side, d c 0 0"
1056 : "\n\t" "## %xmm13: 0"
1057 : "\n\t" "##"
1058 : "\n\t" "## Outputs:"
1059 : "\n\t" "## %xmm6: b+c+d b+c 0 0"
1060 : "\n\t" "## %xmm0: b+c+d b+c b a"
1061 : "\n\t" "##"
1062 : "\n\t" ".Laes_schedule_192_smear:"
1063 : "\n\t" " pshufd $0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0"
1064 : "\n\t" " pxor %xmm0, %xmm6 # -> c+d c 0 0"
1065 : "\n\t" " pshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a"
1066 : "\n\t" " pxor %xmm6, %xmm0 # -> b+c+d b+c b a"
1067 : "\n\t" " pshufd $0x0E, %xmm0, %xmm6"
1068 : "\n\t" " pslldq $8, %xmm6 # clobber low side with zeros"
1069 : "\n\t" " ret"
1070 :
1071 : "\n\t" "##"
1072 : "\n\t" "## .Laes_schedule_256"
1073 : "\n\t" "##"
1074 : "\n\t" "## 256-bit specific part of key schedule."
1075 : "\n\t" "##"
1076 : "\n\t" "## The structure here is very similar to the 128-bit"
1077 : "\n\t" "## schedule, but with an additional 'low side' in"
1078 : "\n\t" "## %xmm6. The low side's rounds are the same as the"
1079 : "\n\t" "## high side's, except no rcon and no rotation."
1080 : "\n\t" "##"
1081 : "\n\t" ".Laes_schedule_256:"
1082 : "\n\t" " movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)"
1083 : "\n\t" " call .Laes_schedule_transform # input transform"
1084 : "\n\t" " mov $7, %rsi"
1085 :
1086 : "\n\t" ".Laes_schedule_256_L:"
1087 : "\n\t" " call .Laes_schedule_mangle # output low result"
1088 : "\n\t" " movdqa %xmm0, %xmm6 # save cur_lo in xmm6"
1089 :
1090 : "\n\t" " # high round"
1091 : "\n\t" " call .Laes_schedule_round"
1092 : "\n\t" " dec %rsi"
1093 : "\n\t" " jz .Laes_schedule_mangle_last"
1094 : "\n\t" " call .Laes_schedule_mangle "
1095 :
1096 : "\n\t" " # low round. swap xmm7 and xmm6"
1097 : "\n\t" " pshufd $0xFF, %xmm0, %xmm0"
1098 : "\n\t" " movdqa %xmm7, %xmm5"
1099 : "\n\t" " movdqa %xmm6, %xmm7"
1100 : "\n\t" " call .Laes_schedule_low_round"
1101 : "\n\t" " movdqa %xmm5, %xmm7"
1102 :
1103 : "\n\t" " jmp .Laes_schedule_256_L"
1104 :
1105 : "\n\t" "##"
1106 : "\n\t" "## .Laes_schedule_round"
1107 : "\n\t" "##"
1108 : "\n\t" "## Runs one main round of the key schedule on %xmm0, %xmm7"
1109 : "\n\t" "##"
1110 : "\n\t" "## Specifically, runs subbytes on the high dword of %xmm0"
1111 : "\n\t" "## then rotates it by one byte and xors into the low dword of"
1112 : "\n\t" "## %xmm7."
1113 : "\n\t" "##"
1114 : "\n\t" "## Adds rcon from low byte of %xmm8, then rotates %xmm8 for"
1115 : "\n\t" "## next rcon."
1116 : "\n\t" "##"
1117 : "\n\t" "## Smears the dwords of %xmm7 by xoring the low into the"
1118 : "\n\t" "## second low, result into third, result into highest."
1119 : "\n\t" "##"
1120 : "\n\t" "## Returns results in %xmm7 = %xmm0."
1121 : "\n\t" "## Clobbers %xmm1-%xmm4, %r11."
1122 : "\n\t" "##"
1123 : "\n\t" ".Laes_schedule_round:"
1124 : "\n\t" " # extract rcon from xmm8"
1125 : "\n\t" " pxor %xmm1, %xmm1"
1126 : "\n\t" " palignr $15, %xmm8, %xmm1"
1127 : "\n\t" " palignr $15, %xmm8, %xmm8"
1128 : "\n\t" " pxor %xmm1, %xmm7"
1129 :
1130 : "\n\t" " # rotate"
1131 : "\n\t" " pshufd $0xFF, %xmm0, %xmm0"
1132 : "\n\t" " palignr $1, %xmm0, %xmm0"
1133 :
1134 : "\n\t" " # fall through..."
1135 :
1136 : "\n\t" " # low round: same as high round, but no rotation and no rcon."
1137 : "\n\t" ".Laes_schedule_low_round:"
1138 : "\n\t" " # smear xmm7"
1139 : "\n\t" " movdqa %xmm7, %xmm1"
1140 : "\n\t" " pslldq $4, %xmm7"
1141 : "\n\t" " pxor %xmm1, %xmm7"
1142 : "\n\t" " movdqa %xmm7, %xmm1"
1143 : "\n\t" " pslldq $8, %xmm7"
1144 : "\n\t" " pxor %xmm1, %xmm7"
1145 : "\n\t" " pxor .Lk_s63(%r10), %xmm7"
1146 :
1147 : "\n\t" " # subbytes"
1148 : "\n\t" " movdqa %xmm9, %xmm1"
1149 : "\n\t" " pandn %xmm0, %xmm1"
1150 : "\n\t" " psrld $4, %xmm1 # 1 = i"
1151 : "\n\t" " pand %xmm9, %xmm0 # 0 = k"
1152 : "\n\t" " movdqa %xmm11, %xmm2 # 2 : a/k"
1153 : "\n\t" " pshufb %xmm0, %xmm2 # 2 = a/k"
1154 : "\n\t" " pxor %xmm1, %xmm0 # 0 = j"
1155 : "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/i"
1156 : "\n\t" " pshufb %xmm1, %xmm3 # 3 = 1/i"
1157 : "\n\t" " pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k"
1158 : "\n\t" " movdqa %xmm10, %xmm4 # 4 : 1/j"
1159 : "\n\t" " pshufb %xmm0, %xmm4 # 4 = 1/j"
1160 : "\n\t" " pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k"
1161 : "\n\t" " movdqa %xmm10, %xmm2 # 2 : 1/iak"
1162 : "\n\t" " pshufb %xmm3, %xmm2 # 2 = 1/iak"
1163 : "\n\t" " pxor %xmm0, %xmm2 # 2 = io"
1164 : "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/jak"
1165 : "\n\t" " pshufb %xmm4, %xmm3 # 3 = 1/jak"
1166 : "\n\t" " pxor %xmm1, %xmm3 # 3 = jo"
1167 : "\n\t" " movdqa .Lk_sb1(%r10), %xmm4 # 4 : sbou"
1168 : "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbou"
1169 : "\n\t" " movdqa .Lk_sb1+16(%r10), %xmm0 # 0 : sbot"
1170 : "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
1171 : "\n\t" " pxor %xmm4, %xmm0 # 0 = sbox output"
1172 :
1173 : "\n\t" " # add in smeared stuff"
1174 : "\n\t" " pxor %xmm7, %xmm0 "
1175 : "\n\t" " movdqa %xmm0, %xmm7"
1176 : "\n\t" " ret"
1177 :
1178 : "\n\t" "##"
1179 : "\n\t" "## .Laes_schedule_transform"
1180 : "\n\t" "##"
1181 : "\n\t" "## Linear-transform %xmm0 according to tables at (%r11)"
1182 : "\n\t" "##"
1183 : "\n\t" "## Requires that %xmm9 = 0x0F0F... as in preheat"
1184 : "\n\t" "## Output in %xmm0"
1185 : "\n\t" "## Clobbers %xmm1, %xmm2"
1186 : "\n\t" "##"
1187 : "\n\t" ".Laes_schedule_transform:"
1188 : "\n\t" " movdqa %xmm9, %xmm1"
1189 : "\n\t" " pandn %xmm0, %xmm1"
1190 : "\n\t" " psrld $4, %xmm1"
1191 : "\n\t" " pand %xmm9, %xmm0"
1192 : "\n\t" " movdqa (%r11), %xmm2 # lo"
1193 : "\n\t" " pshufb %xmm0, %xmm2"
1194 : "\n\t" " movdqa 16(%r11), %xmm0 # hi"
1195 : "\n\t" " pshufb %xmm1, %xmm0"
1196 : "\n\t" " pxor %xmm2, %xmm0"
1197 : "\n\t" " ret"
1198 :
1199 : "\n\t" "##"
1200 : "\n\t" "## .Laes_schedule_mangle"
1201 : "\n\t" "##"
1202 : "\n\t" "## Mangle xmm0 from (basis-transformed) standard version"
1203 : "\n\t" "## to our version."
1204 : "\n\t" "##"
1205 : "\n\t" "## On encrypt,"
1206 : "\n\t" "## xor with 0x63"
1207 : "\n\t" "## multiply by circulant 0,1,1,1"
1208 : "\n\t" "## apply shiftrows transform"
1209 : "\n\t" "##"
1210 : "\n\t" "## On decrypt,"
1211 : "\n\t" "## xor with 0x63"
1212 : "\n\t" "## multiply by 'inverse mixcolumns' circulant E,B,D,9"
1213 : "\n\t" "## deskew"
1214 : "\n\t" "## apply shiftrows transform"
1215 : "\n\t" "##"
1216 : "\n\t" "##"
1217 : "\n\t" "## Writes out to (%rdx), and increments or decrements it"
1218 : "\n\t" "## Keeps track of round number mod 4 in %r8"
1219 : "\n\t" "## Preserves xmm0"
1220 : "\n\t" "## Clobbers xmm1-xmm5"
1221 : "\n\t" "##"
1222 : "\n\t" ".Laes_schedule_mangle:"
1223 : "\n\t" " movdqa %xmm0, %xmm4 # save xmm0 for later"
1224 : "\n\t" " movdqa .Lk_mc_forward(%r10),%xmm5"
1225 : "\n\t" " test %rcx, %rcx"
1226 : "\n\t" " jnz .Laes_schedule_mangle_dec"
1227 :
1228 : "\n\t" " # encrypting"
1229 : "\n\t" " add $16, %rdx"
1230 : "\n\t" " pxor .Lk_s63(%r10),%xmm4"
1231 : "\n\t" " pshufb %xmm5, %xmm4"
1232 : "\n\t" " movdqa %xmm4, %xmm3"
1233 : "\n\t" " pshufb %xmm5, %xmm4"
1234 : "\n\t" " pxor %xmm4, %xmm3"
1235 : "\n\t" " pshufb %xmm5, %xmm4"
1236 : "\n\t" " pxor %xmm4, %xmm3"
1237 :
1238 : "\n\t" " jmp .Laes_schedule_mangle_both"
1239 :
1240 : "\n\t" ".Laes_schedule_mangle_dec:"
1241 : "\n\t" " lea .Lk_dks_1(%r10), %r11 # first table: *9"
1242 : "\n\t" " call .Laes_schedule_transform"
1243 : "\n\t" " movdqa %xmm0, %xmm3"
1244 : "\n\t" " pshufb %xmm5, %xmm3"
1245 :
1246 : "\n\t" " add $32, %r11 # next table: *B"
1247 : "\n\t" " call .Laes_schedule_transform"
1248 : "\n\t" " pxor %xmm0, %xmm3"
1249 : "\n\t" " pshufb %xmm5, %xmm3"
1250 :
1251 : "\n\t" " add $32, %r11 # next table: *D"
1252 : "\n\t" " call .Laes_schedule_transform"
1253 : "\n\t" " pxor %xmm0, %xmm3"
1254 : "\n\t" " pshufb %xmm5, %xmm3"
1255 :
1256 : "\n\t" " add $32, %r11 # next table: *E"
1257 : "\n\t" " call .Laes_schedule_transform"
1258 : "\n\t" " pxor %xmm0, %xmm3"
1259 : "\n\t" " pshufb %xmm5, %xmm3"
1260 :
1261 : "\n\t" " movdqa %xmm4, %xmm0 # restore %xmm0"
1262 : "\n\t" " add $-16, %rdx"
1263 :
1264 : "\n\t" ".Laes_schedule_mangle_both:"
1265 : "\n\t" " pshufb .Lk_sr(%r8,%r10),%xmm3"
1266 : "\n\t" " add $-16, %r8"
1267 : "\n\t" " and $48, %r8"
1268 : "\n\t" " movdqa %xmm3, (%rdx)"
1269 : "\n\t" " ret"
1270 :
1271 : "\n\t" "##"
1272 : "\n\t" "## .Laes_schedule_mangle_last"
1273 : "\n\t" "##"
1274 : "\n\t" "## Mangler for last round of key schedule"
1275 : "\n\t" "## Mangles %xmm0"
1276 : "\n\t" "## when encrypting, outputs out(%xmm0) ^ 63"
1277 : "\n\t" "## when decrypting, outputs unskew(%xmm0)"
1278 : "\n\t" "##"
1279 : "\n\t" "## Always called right before return... jumps to cleanup and exits"
1280 : "\n\t" "##"
1281 : "\n\t" ".Laes_schedule_mangle_last:"
1282 : "\n\t" " # schedule last round key from xmm0"
1283 : "\n\t" " lea .Lk_deskew(%r10),%r11 # prepare to deskew"
1284 : "\n\t" " test %rcx, %rcx"
1285 : "\n\t" " jnz .Laes_schedule_mangle_last_dec"
1286 :
1287 : "\n\t" " # encrypting"
1288 : "\n\t" " pshufb .Lk_sr(%r8,%r10),%xmm0 # output permute"
1289 : "\n\t" " lea .Lk_opt(%r10), %r11 # prepare to output transform"
1290 : "\n\t" " add $32, %rdx"
1291 :
1292 : "\n\t" ".Laes_schedule_mangle_last_dec:"
1293 : "\n\t" " add $-16, %rdx"
1294 : "\n\t" " pxor .Lk_s63(%r10), %xmm0"
1295 : "\n\t" " call .Laes_schedule_transform # output transform"
1296 : "\n\t" " movdqa %xmm0, (%rdx) # save last key"
1297 :
1298 : "\n\t" " #_aes_cleanup"
1299 : "\n\t" " pxor %xmm0, %xmm0"
1300 : "\n\t" " pxor %xmm1, %xmm1"
1301 : "\n\t" " pxor %xmm2, %xmm2"
1302 : "\n\t" " pxor %xmm3, %xmm3"
1303 : "\n\t" " pxor %xmm4, %xmm4"
1304 : "\n\t" " pxor %xmm5, %xmm5"
1305 : "\n\t" " pxor %xmm6, %xmm6"
1306 : "\n\t" " pxor %xmm7, %xmm7"
1307 : "\n\t" " pxor %xmm8, %xmm8"
1308 : "\n\t" " ret"
1309 : X("\n\t" ".size _aes_schedule_core,.-_aes_schedule_core")
1310 :
1311 : "\n\t" "########################################################"
1312 : "\n\t" "## ##"
1313 : "\n\t" "## Constants ##"
1314 : "\n\t" "## ##"
1315 : "\n\t" "########################################################"
1316 :
1317 : "\n\t" ".align 16"
1318 : X("\n\t" ".type _aes_consts,@object")
1319 : "\n\t" ".Laes_consts:"
1320 : "\n\t" "_aes_consts:"
1321 : "\n\t" " # s0F"
1322 : "\n\t" " .Lk_s0F = .-.Laes_consts"
1323 : "\n\t" " .quad 0x0F0F0F0F0F0F0F0F"
1324 : "\n\t" " .quad 0x0F0F0F0F0F0F0F0F"
1325 :
1326 : "\n\t" " # input transform (lo, hi)"
1327 : "\n\t" " .Lk_ipt = .-.Laes_consts"
1328 : "\n\t" " .quad 0xC2B2E8985A2A7000"
1329 : "\n\t" " .quad 0xCABAE09052227808"
1330 : "\n\t" " .quad 0x4C01307D317C4D00"
1331 : "\n\t" " .quad 0xCD80B1FCB0FDCC81"
1332 :
1333 : "\n\t" " # inv, inva"
1334 : "\n\t" " .Lk_inv = .-.Laes_consts"
1335 : "\n\t" " .quad 0x0E05060F0D080180"
1336 : "\n\t" " .quad 0x040703090A0B0C02"
1337 : "\n\t" " .quad 0x01040A060F0B0780"
1338 : "\n\t" " .quad 0x030D0E0C02050809"
1339 :
1340 : "\n\t" " # sb1u, sb1t"
1341 : "\n\t" " .Lk_sb1 = .-.Laes_consts"
1342 : "\n\t" " .quad 0xB19BE18FCB503E00"
1343 : "\n\t" " .quad 0xA5DF7A6E142AF544"
1344 : "\n\t" " .quad 0x3618D415FAE22300"
1345 : "\n\t" " .quad 0x3BF7CCC10D2ED9EF"
1346 :
1347 :
1348 : "\n\t" " # sb2u, sb2t"
1349 : "\n\t" " .Lk_sb2 = .-.Laes_consts"
1350 : "\n\t" " .quad 0xE27A93C60B712400"
1351 : "\n\t" " .quad 0x5EB7E955BC982FCD"
1352 : "\n\t" " .quad 0x69EB88400AE12900"
1353 : "\n\t" " .quad 0xC2A163C8AB82234A"
1354 :
1355 : "\n\t" " # sbou, sbot"
1356 : "\n\t" " .Lk_sbo = .-.Laes_consts"
1357 : "\n\t" " .quad 0xD0D26D176FBDC700"
1358 : "\n\t" " .quad 0x15AABF7AC502A878"
1359 : "\n\t" " .quad 0xCFE474A55FBB6A00"
1360 : "\n\t" " .quad 0x8E1E90D1412B35FA"
1361 :
1362 : "\n\t" " # mc_forward"
1363 : "\n\t" " .Lk_mc_forward = .-.Laes_consts"
1364 : "\n\t" " .quad 0x0407060500030201"
1365 : "\n\t" " .quad 0x0C0F0E0D080B0A09"
1366 : "\n\t" " .quad 0x080B0A0904070605"
1367 : "\n\t" " .quad 0x000302010C0F0E0D"
1368 : "\n\t" " .quad 0x0C0F0E0D080B0A09"
1369 : "\n\t" " .quad 0x0407060500030201"
1370 : "\n\t" " .quad 0x000302010C0F0E0D"
1371 : "\n\t" " .quad 0x080B0A0904070605"
1372 :
1373 : "\n\t" " # mc_backward"
1374 : "\n\t" " .Lk_mc_backward = .-.Laes_consts"
1375 : "\n\t" " .quad 0x0605040702010003"
1376 : "\n\t" " .quad 0x0E0D0C0F0A09080B"
1377 : "\n\t" " .quad 0x020100030E0D0C0F"
1378 : "\n\t" " .quad 0x0A09080B06050407"
1379 : "\n\t" " .quad 0x0E0D0C0F0A09080B"
1380 : "\n\t" " .quad 0x0605040702010003"
1381 : "\n\t" " .quad 0x0A09080B06050407"
1382 : "\n\t" " .quad 0x020100030E0D0C0F"
1383 :
1384 : "\n\t" " # sr"
1385 : "\n\t" " .Lk_sr = .-.Laes_consts"
1386 : "\n\t" " .quad 0x0706050403020100"
1387 : "\n\t" " .quad 0x0F0E0D0C0B0A0908"
1388 : "\n\t" " .quad 0x030E09040F0A0500"
1389 : "\n\t" " .quad 0x0B06010C07020D08"
1390 : "\n\t" " .quad 0x0F060D040B020900"
1391 : "\n\t" " .quad 0x070E050C030A0108"
1392 : "\n\t" " .quad 0x0B0E0104070A0D00"
1393 : "\n\t" " .quad 0x0306090C0F020508"
1394 :
1395 : "\n\t" " # rcon"
1396 : "\n\t" " .Lk_rcon = .-.Laes_consts"
1397 : "\n\t" " .quad 0x1F8391B9AF9DEEB6"
1398 : "\n\t" " .quad 0x702A98084D7C7D81"
1399 :
1400 : "\n\t" " # s63: all equal to 0x63 transformed"
1401 : "\n\t" " .Lk_s63 = .-.Laes_consts"
1402 : "\n\t" " .quad 0x5B5B5B5B5B5B5B5B"
1403 : "\n\t" " .quad 0x5B5B5B5B5B5B5B5B"
1404 :
1405 : "\n\t" " # output transform"
1406 : "\n\t" " .Lk_opt = .-.Laes_consts"
1407 : "\n\t" " .quad 0xFF9F4929D6B66000"
1408 : "\n\t" " .quad 0xF7974121DEBE6808"
1409 : "\n\t" " .quad 0x01EDBD5150BCEC00"
1410 : "\n\t" " .quad 0xE10D5DB1B05C0CE0"
1411 :
1412 : "\n\t" " # deskew tables: inverts the sbox's 'skew'"
1413 : "\n\t" " .Lk_deskew = .-.Laes_consts"
1414 : "\n\t" " .quad 0x07E4A34047A4E300"
1415 : "\n\t" " .quad 0x1DFEB95A5DBEF91A"
1416 : "\n\t" " .quad 0x5F36B5DC83EA6900"
1417 : "\n\t" " .quad 0x2841C2ABF49D1E77"
1418 :
1419 : "\n\t" "##"
1420 : "\n\t" "## Decryption stuff"
1421 : "\n\t" "## Key schedule constants"
1422 : "\n\t" "##"
1423 : "\n\t" " # decryption key schedule: x -> invskew x*9"
1424 : "\n\t" " .Lk_dks_1 = .-.Laes_consts"
1425 : "\n\t" " .quad 0xB6116FC87ED9A700"
1426 : "\n\t" " .quad 0x4AED933482255BFC"
1427 : "\n\t" " .quad 0x4576516227143300"
1428 : "\n\t" " .quad 0x8BB89FACE9DAFDCE"
1429 :
1430 : "\n\t" " # decryption key schedule: invskew x*9 -> invskew x*D"
1431 : "\n\t" " .Lk_dks_2 = .-.Laes_consts"
1432 : "\n\t" " .quad 0x27438FEBCCA86400"
1433 : "\n\t" " .quad 0x4622EE8AADC90561"
1434 : "\n\t" " .quad 0x815C13CE4F92DD00"
1435 : "\n\t" " .quad 0x73AEE13CBD602FF2"
1436 :
1437 : "\n\t" " # decryption key schedule: invskew x*D -> invskew x*B"
1438 : "\n\t" " .Lk_dks_3 = .-.Laes_consts"
1439 : "\n\t" " .quad 0x03C4C50201C6C700"
1440 : "\n\t" " .quad 0xF83F3EF9FA3D3CFB"
1441 : "\n\t" " .quad 0xEE1921D638CFF700"
1442 : "\n\t" " .quad 0xA5526A9D7384BC4B"
1443 :
1444 : "\n\t" " # decryption key schedule: invskew x*B -> invskew x*E + 0x63"
1445 : "\n\t" " .Lk_dks_4 = .-.Laes_consts"
1446 : "\n\t" " .quad 0xE3C390B053732000"
1447 : "\n\t" " .quad 0xA080D3F310306343"
1448 : "\n\t" " .quad 0xA0CA214B036982E8"
1449 : "\n\t" " .quad 0x2F45AEC48CE60D67"
1450 :
1451 : "\n\t" "##"
1452 : "\n\t" "## Decryption stuff"
1453 : "\n\t" "## Round function constants"
1454 : "\n\t" "##"
1455 : "\n\t" " # decryption input transform"
1456 : "\n\t" " .Lk_dipt = .-.Laes_consts"
1457 : "\n\t" " .quad 0x0F505B040B545F00"
1458 : "\n\t" " .quad 0x154A411E114E451A"
1459 : "\n\t" " .quad 0x86E383E660056500"
1460 : "\n\t" " .quad 0x12771772F491F194"
1461 :
1462 : "\n\t" " # decryption sbox output *9*u, *9*t"
1463 : "\n\t" " .Lk_dsb9 = .-.Laes_consts"
1464 : "\n\t" " .quad 0x851C03539A86D600"
1465 : "\n\t" " .quad 0xCAD51F504F994CC9"
1466 : "\n\t" " .quad 0xC03B1789ECD74900"
1467 : "\n\t" " .quad 0x725E2C9EB2FBA565"
1468 :
1469 : "\n\t" " # decryption sbox output *D*u, *D*t"
1470 : "\n\t" " .Lk_dsbd = .-.Laes_consts"
1471 : "\n\t" " .quad 0x7D57CCDFE6B1A200"
1472 : "\n\t" " .quad 0xF56E9B13882A4439"
1473 : "\n\t" " .quad 0x3CE2FAF724C6CB00"
1474 : "\n\t" " .quad 0x2931180D15DEEFD3"
1475 :
1476 : "\n\t" " # decryption sbox output *B*u, *B*t"
1477 : "\n\t" " .Lk_dsbb = .-.Laes_consts"
1478 : "\n\t" " .quad 0xD022649296B44200"
1479 : "\n\t" " .quad 0x602646F6B0F2D404"
1480 : "\n\t" " .quad 0xC19498A6CD596700"
1481 : "\n\t" " .quad 0xF3FF0C3E3255AA6B"
1482 :
1483 : "\n\t" " # decryption sbox output *E*u, *E*t"
1484 : "\n\t" " .Lk_dsbe = .-.Laes_consts"
1485 : "\n\t" " .quad 0x46F2929626D4D000"
1486 : "\n\t" " .quad 0x2242600464B4F6B0"
1487 : "\n\t" " .quad 0x0C55A6CDFFAAC100"
1488 : "\n\t" " .quad 0x9467F36B98593E32"
1489 :
1490 : "\n\t" " # decryption sbox final output"
1491 : "\n\t" " .Lk_dsbo = .-.Laes_consts"
1492 : "\n\t" " .quad 0x1387EA537EF94000"
1493 : "\n\t" " .quad 0xC7AA6DB9D4943E2D"
1494 : "\n\t" " .quad 0x12D7560F93441D00"
1495 : "\n\t" " .quad 0xCA4B8159D8C58E9C"
1496 : X("\n\t" ".size _aes_consts,.-_aes_consts")
1497 : );
1498 :
1499 : #endif /* USE_SSSE3 */
|