LCOV - code coverage report
Current view: top level - cipher - rijndael-ssse3-amd64.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 0 163 0.0 %
Date: 2016-12-15 12:59:22 Functions: 0 15 0.0 %

          Line data    Source code
       1             : /* SSSE3 vector permutation AES for Libgcrypt
       2             :  * Copyright (C) 2014-2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
       3             :  *
       4             :  * This file is part of Libgcrypt.
       5             :  *
       6             :  * Libgcrypt is free software; you can redistribute it and/or modify
       7             :  * it under the terms of the GNU Lesser General Public License as
       8             :  * published by the Free Software Foundation; either version 2.1 of
       9             :  * the License, or (at your option) any later version.
      10             :  *
      11             :  * Libgcrypt is distributed in the hope that it will be useful,
      12             :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13             :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14             :  * GNU Lesser General Public License for more details.
      15             :  *
      16             :  * You should have received a copy of the GNU Lesser General Public
      17             :  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
      18             :  *
      19             :  *
      20             :  * The code is based on the public domain library libvpaes version 0.5
      21             :  * available at http://crypto.stanford.edu/vpaes/ and which carries
      22             :  * this notice:
      23             :  *
      24             :  *     libvpaes: constant-time SSSE3 AES encryption and decryption.
      25             :  *     version 0.5
      26             :  *
      27             :  *     By Mike Hamburg, Stanford University, 2009.  Public domain.
      28             :  *     I wrote essentially all of this code.  I did not write the test
      29             :  *     vectors; they are the NIST known answer tests.  I hereby release all
      30             :  *     the code and documentation here that I wrote into the public domain.
      31             :  *
      32             :  *     This is an implementation of AES following my paper,
      33             :  *       "Accelerating AES with Vector Permute Instructions"
      34             :  *       CHES 2009; http://shiftleft.org/papers/vector_aes/
      35             :  */
      36             : 
      37             : #include <config.h>
      38             : #include <stdio.h>
      39             : #include <stdlib.h>
      40             : #include <string.h> /* for memcmp() */
      41             : 
      42             : #include "types.h"  /* for byte and u32 typedefs */
      43             : #include "g10lib.h"
      44             : #include "cipher.h"
      45             : #include "bufhelp.h"
      46             : #include "cipher-selftest.h"
      47             : #include "rijndael-internal.h"
      48             : #include "./cipher-internal.h"
      49             : 
      50             : 
      51             : #ifdef USE_SSSE3
      52             : 
      53             : 
      54             : #if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
      55             : /* Prevent compiler from issuing SSE instructions between asm blocks. */
      56             : #  pragma GCC target("no-sse")
      57             : #endif
      58             : 
      59             : 
      60             : /* Two macros to be called prior and after the use of SSSE3
      61             :   instructions.  There should be no external function calls between
      62             :   the use of these macros.  Their purpose is to make sure that the
      63             :   SSE registers are cleared and won't reveal any information about
      64             :   the key or the data.  */
      65             : #ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
                       /* WIN64 variant.  vpaes_ssse3_prepare() stores XMM6-XMM15 into the
                          local buffer `ssse3_state'; vpaes_ssse3_cleanup() zeroes XMM0-XMM5
                          and reloads XMM6-XMM15 from that buffer.  Both macros expect a
                          byte array named `ssse3_state' of at least SSSE3_STATE_SIZE bytes
                          to be in scope where they are used.  */
      66             : # define SSSE3_STATE_SIZE (16 * 10) /* ten 16-byte XMM save slots */
      67             : /* XMM6-XMM15 are callee-saved registers on WIN64. */
      68             : # define vpaes_ssse3_prepare() \
      69             :     asm volatile ("movdqu %%xmm6,  0*16(%0)\n\t" \
      70             :                   "movdqu %%xmm7,  1*16(%0)\n\t" \
      71             :                   "movdqu %%xmm8,  2*16(%0)\n\t" \
      72             :                   "movdqu %%xmm9,  3*16(%0)\n\t" \
      73             :                   "movdqu %%xmm10, 4*16(%0)\n\t" \
      74             :                   "movdqu %%xmm11, 5*16(%0)\n\t" \
      75             :                   "movdqu %%xmm12, 6*16(%0)\n\t" \
      76             :                   "movdqu %%xmm13, 7*16(%0)\n\t" \
      77             :                   "movdqu %%xmm14, 8*16(%0)\n\t" \
      78             :                   "movdqu %%xmm15, 9*16(%0)\n\t" \
      79             :                   : \
      80             :                   : "r" (ssse3_state) \
      81             :                   : "memory" )
      82             : # define vpaes_ssse3_cleanup() \
      83             :     asm volatile ("pxor    %%xmm0,  %%xmm0 \n\t" \
      84             :                   "pxor    %%xmm1,  %%xmm1 \n\t" \
      85             :                   "pxor    %%xmm2,  %%xmm2 \n\t" \
      86             :                   "pxor    %%xmm3,  %%xmm3 \n\t" \
      87             :                   "pxor    %%xmm4,  %%xmm4 \n\t" \
      88             :                   "pxor    %%xmm5,  %%xmm5 \n\t" \
      89             :                   "movdqu 0*16(%0), %%xmm6 \n\t" \
      90             :                   "movdqu 1*16(%0), %%xmm7 \n\t" \
      91             :                   "movdqu 2*16(%0), %%xmm8 \n\t" \
      92             :                   "movdqu 3*16(%0), %%xmm9 \n\t" \
      93             :                   "movdqu 4*16(%0), %%xmm10 \n\t" \
      94             :                   "movdqu 5*16(%0), %%xmm11 \n\t" \
      95             :                   "movdqu 6*16(%0), %%xmm12 \n\t" \
      96             :                   "movdqu 7*16(%0), %%xmm13 \n\t" \
      97             :                   "movdqu 8*16(%0), %%xmm14 \n\t" \
      98             :                   "movdqu 9*16(%0), %%xmm15 \n\t" \
      99             :                   : \
     100             :                   : "r" (ssse3_state) \
     101             :                   : "memory" )
     102             : #else
                       /* Other platforms.  Nothing is saved: vpaes_ssse3_prepare() merely
                          references `ssse3_state' (a 1-byte placeholder here), and
                          vpaes_ssse3_cleanup() zeroes XMM0-XMM8.  XMM9-XMM15 are not
                          touched by this cleanup.  */
     103             : # define SSSE3_STATE_SIZE 1
     104             : # define vpaes_ssse3_prepare() (void)ssse3_state
     105             : # define vpaes_ssse3_cleanup() \
     106             :     asm volatile ("pxor    %%xmm0,  %%xmm0 \n\t" \
     107             :                   "pxor    %%xmm1,  %%xmm1 \n\t" \
     108             :                   "pxor    %%xmm2,  %%xmm2 \n\t" \
     109             :                   "pxor    %%xmm3,  %%xmm3 \n\t" \
     110             :                   "pxor    %%xmm4,  %%xmm4 \n\t" \
     111             :                   "pxor    %%xmm5,  %%xmm5 \n\t" \
     112             :                   "pxor    %%xmm6,  %%xmm6 \n\t" \
     113             :                   "pxor    %%xmm7,  %%xmm7 \n\t" \
     114             :                   "pxor    %%xmm8,  %%xmm8 \n\t" \
     115             :                   ::: "memory" )
     116             : #endif
     117             : 
                       /* Set-up for the encryption core.  Runs vpaes_ssse3_prepare(), then
                          sets CONST_PTR (bound to RCX by the "=c" constraint) to the address
                          of .Laes_consts and preloads XMM9-XMM15 with the tables named in
                          the per-line asm notes below.  The macro expands to two statements,
                          so it must not be used as the unbraced body of an `if' or loop.  */
     118             : #define vpaes_ssse3_prepare_enc(const_ptr) \
     119             :     vpaes_ssse3_prepare(); \
     120             :     asm volatile ("lea     .Laes_consts(%%rip), %q0 \n\t" \
     121             :                   "movdqa            (%q0), %%xmm9  # 0F \n\t" \
     122             :                   "movdqa  .Lk_inv   (%q0), %%xmm10 # inv \n\t" \
     123             :                   "movdqa  .Lk_inv+16(%q0), %%xmm11 # inva \n\t" \
     124             :                   "movdqa  .Lk_sb1   (%q0), %%xmm13 # sb1u \n\t" \
     125             :                   "movdqa  .Lk_sb1+16(%q0), %%xmm12 # sb1t \n\t" \
     126             :                   "movdqa  .Lk_sb2   (%q0), %%xmm15 # sb2u \n\t" \
     127             :                   "movdqa  .Lk_sb2+16(%q0), %%xmm14 # sb2t \n\t" \
     128             :                   : "=c" (const_ptr) \
     129             :                   : \
     130             :                   : "memory" )
     131             : 
                       /* Set-up for the decryption core.  Same shape as the encryption
                          set-up: runs vpaes_ssse3_prepare(), points CONST_PTR (RCX via "=c")
                          at .Laes_consts and preloads XMM9-XMM15, but with the .Lk_dsb*
                          tables, and additionally loads XMM8 from .Lk_dsbe.  Expands to two
                          statements, so it must not be used as the unbraced body of an `if'
                          or loop.  */
     132             : #define vpaes_ssse3_prepare_dec(const_ptr) \
     133             :     vpaes_ssse3_prepare(); \
     134             :     asm volatile ("lea     .Laes_consts(%%rip), %q0 \n\t" \
     135             :                   "movdqa            (%q0), %%xmm9  # 0F \n\t" \
     136             :                   "movdqa  .Lk_inv   (%q0), %%xmm10 # inv \n\t" \
     137             :                   "movdqa  .Lk_inv+16(%q0), %%xmm11 # inva \n\t" \
     138             :                   "movdqa  .Lk_dsb9   (%q0), %%xmm13 # sb9u \n\t" \
     139             :                   "movdqa  .Lk_dsb9+16(%q0), %%xmm12 # sb9t \n\t" \
     140             :                   "movdqa  .Lk_dsbd   (%q0), %%xmm15 # sbdu \n\t" \
     141             :                   "movdqa  .Lk_dsbb   (%q0), %%xmm14 # sbbu \n\t" \
     142             :                   "movdqa  .Lk_dsbe   (%q0), %%xmm8 # sbeu \n\t" \
     143             :                   : "=c" (const_ptr) \
     144             :                   : \
     145             :                   : "memory" )
     146             : 
     147             : 
     148             : 
                       /* Build the encryption key schedule of CTX from KEY.

                          The key size in bits is derived from ctx->rounds (10, 12 or 14
                          rounds give 128, 192 or 256).  The asm block hands five values to
                          _aes_schedule_core in RDI, ESI, RDX, ECX and R8D and calls it; the
                          schedule is written starting at ctx->keyschenc32[0][0].  Afterwards
                          the raw key bytes are copied to the start of ctx->keyschdec32 so
                          that _gcry_aes_ssse3_prepare_decryption can read them from there.

                          The meaning of the `dir' and `rotoffs' values is defined by
                          _aes_schedule_core, which is not part of this file section.  */
     149             : void
     150           0 : _gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key)
     151             : {
     152           0 :   unsigned int keybits = (ctx->rounds - 10) * 32 + 128;
     153             :   byte ssse3_state[SSSE3_STATE_SIZE];
     154             : 
     155             :   vpaes_ssse3_prepare();
     156             : 
     157           0 :   asm volatile ("leaq %q[key], %%rdi"                 "\n\t" /* rdi = address of key */
     158             :                 "movl %[bits], %%esi"                 "\n\t" /* esi = keybits */
     159             :                 "leaq %[buf], %%rdx"                  "\n\t" /* rdx = output schedule */
     160             :                 "movl %[dir], %%ecx"                  "\n\t" /* ecx = 0 here */
     161             :                 "movl %[rotoffs], %%r8d"              "\n\t" /* r8d = 48 here */
     162             :                 "call _aes_schedule_core"             "\n\t"
     163             :                 :
     164             :                 : [key] "m" (*key),
     165             :                   [bits] "g" (keybits),
     166             :                   [buf] "m" (ctx->keyschenc32[0][0]),
     167             :                   [dir] "g" (0),
     168             :                   [rotoffs] "g" (48)
     169             :                 : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi",
     170             :                   "cc", "memory");
     171             : 
     172           0 :   vpaes_ssse3_cleanup();
     173             : 
     174             :   /* Save key for setting up decryption. */
     175           0 :   memcpy(&ctx->keyschdec32[0][0], key, keybits / 8);
     176           0 : }
     177             : 
     178             : 
     179             : /* Make a decryption key from an encryption key. */
                       /* The raw key is read from the start of ctx->keyschdec32 (where
                          _gcry_aes_ssse3_do_setkey stored it) and _aes_schedule_core is
                          called with dir = 1.  The output pointer passed in RDX is
                          &ctx->keyschdec32[ctx->rounds][0], i.e. the last row of the same
                          array that also holds the input key; presumably the core fills the
                          schedule downwards from there -- confirm against
                          _aes_schedule_core.  rotoffs is 0 for 192-bit keys and 32
                          otherwise.  */
     180             : void
     181           0 : _gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx)
     182             : {
     183           0 :   unsigned int keybits = (ctx->rounds - 10) * 32 + 128;
     184             :   byte ssse3_state[SSSE3_STATE_SIZE];
     185             : 
     186             :   vpaes_ssse3_prepare();
     187             : 
     188           0 :   asm volatile ("leaq %q[key], %%rdi"                 "\n\t" /* rdi = address of saved raw key */
     189             :                 "movl %[bits], %%esi"                 "\n\t" /* esi = keybits */
     190             :                 "leaq %[buf], %%rdx"                  "\n\t" /* rdx = last schedule row */
     191             :                 "movl %[dir], %%ecx"                  "\n\t" /* ecx = 1 here */
     192             :                 "movl %[rotoffs], %%r8d"              "\n\t" /* r8d = 0 or 32 */
     193             :                 "call _aes_schedule_core"             "\n\t"
     194             :                 :
     195             :                 : [key] "m" (ctx->keyschdec32[0][0]),
     196             :                   [bits] "g" (keybits),
     197           0 :                   [buf] "m" (ctx->keyschdec32[ctx->rounds][0]),
     198             :                   [dir] "g" (1),
     199           0 :                   [rotoffs] "g" ((keybits == 192) ? 0 : 32)
     200             :                 : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi",
     201             :                   "cc", "memory");
     202             : 
     203           0 :   vpaes_ssse3_cleanup();
     204           0 : }
     205             : 
     206             : 
     207             : /* Encrypt one block using the Intel SSSE3 instructions.  Block is input
     208             : * and output through SSE register xmm0. */
                       /* Calls _aes_encrypt_core with EAX = NROUNDS - 1, RDX = address of
                          ctx->keyschenc32 and RCX = AES_CONST_PTR.  EAX and RDX are declared
                          read-write ("+a", "+d"), so the core may change them; RDI, RSI, the
                          flags and memory are declared clobbered.  AES_CONST_PTR is the
                          pointer produced by vpaes_ssse3_prepare_enc().  */
     209             : static inline void
     210           0 : do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds,
     211             :                     const void *aes_const_ptr)
     212             : {
     213           0 :   unsigned int middle_rounds = nrounds - 1;
     214           0 :   const void *keysched = ctx->keyschenc32;
     215             : 
     216           0 :   asm volatile ("call _aes_encrypt_core"              "\n\t"
     217             :                 : "+a" (middle_rounds), "+d" (keysched)
     218             :                 : "c" (aes_const_ptr)
     219             :                 : "rdi", "rsi", "cc", "memory");
     220           0 : }
     221             : 
     222             : 
     223             : /* Decrypt one block using the Intel SSSE3 instructions.  Block is input
     224             : * and output through SSE register xmm0. */
                       /* Calls _aes_decrypt_core with EAX = NROUNDS - 1, RDX = address of
                          ctx->keyschdec32 and RCX = AES_CONST_PTR.  EAX and RDX are declared
                          read-write; RSI, the flags and memory are declared clobbered.
                          NOTE(review): unlike the encrypt helper, RDI is not in the clobber
                          list here -- confirm that _aes_decrypt_core leaves RDI untouched.  */
     225             : static inline void
     226           0 : do_vpaes_ssse3_dec (const RIJNDAEL_context *ctx, unsigned int nrounds,
     227             :                     const void *aes_const_ptr)
     228             : {
     229           0 :   unsigned int middle_rounds = nrounds - 1;
     230           0 :   const void *keysched = ctx->keyschdec32;
     231             : 
     232           0 :   asm volatile ("call _aes_decrypt_core"              "\n\t"
     233             :                 : "+a" (middle_rounds), "+d" (keysched)
     234             :                 : "c" (aes_const_ptr)
     235             :                 : "rsi", "cc", "memory");
     236           0 : }
     237             : 
     238             : 
                       /* Encrypt the single 16-byte block at SRC and store the result at
                          DST.  Both are accessed with unaligned moves (movdqu).  The block
                          travels through XMM0 across the three asm statements, so their
                          order must not change.  Always returns 0.  */
     239             : unsigned int
     240           0 : _gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
     241             :                         const unsigned char *src)
     242             : {
     243           0 :   unsigned int nrounds = ctx->rounds;
     244             :   const void *aes_const_ptr;
     245             :   byte ssse3_state[SSSE3_STATE_SIZE];
     246             : 
     247           0 :   vpaes_ssse3_prepare_enc (aes_const_ptr);
     248           0 :   asm volatile ("movdqu %[src], %%xmm0\n\t"
     249             :                 :
     250             :                 : [src] "m" (*src)
     251             :                 : "memory" );
     252           0 :   do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
     253           0 :   asm volatile ("movdqu %%xmm0, %[dst]\n\t"
     254             :                 : [dst] "=m" (*dst)
     255             :                 :
     256             :                 : "memory" );
     257           0 :   vpaes_ssse3_cleanup ();
     258           0 :   return 0;
     259             : }
     260             : 
     261             : 
                       /* CFB encryption of NBLOCKS 16-byte blocks from INBUF to OUTBUF.

                          XMM0 carries the feedback value: it starts as *IV; for every block
                          it is encrypted, XORed with the input block and stored to OUTBUF,
                          and that stored value stays in XMM0 as the feedback for the next
                          block.  After the loop XMM0 is written back to *IV (for
                          NBLOCKS == 0 this rewrites the unchanged IV).  */
     262             : void
     263           0 : _gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
     264             :                         const unsigned char *inbuf, unsigned char *iv,
     265             :                         size_t nblocks)
     266             : {
     267           0 :   unsigned int nrounds = ctx->rounds;
     268             :   const void *aes_const_ptr;
     269             :   byte ssse3_state[SSSE3_STATE_SIZE];
     270             : 
     271           0 :   vpaes_ssse3_prepare_enc (aes_const_ptr);
     272             : 
     273           0 :   asm volatile ("movdqu %[iv], %%xmm0\n\t"
     274             :                 : /* No output */
     275             :                 : [iv] "m" (*iv)
     276             :                 : "memory" );
     277             : 
     278           0 :   for ( ;nblocks; nblocks-- )
     279             :     {
     280           0 :       do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
     281             : 
     282           0 :       asm volatile ("movdqu %[inbuf], %%xmm1\n\t"
     283             :                     "pxor %%xmm1, %%xmm0\n\t"
     284             :                     "movdqu %%xmm0, %[outbuf]\n\t"
     285             :                     : [outbuf] "=m" (*outbuf)
     286             :                     : [inbuf] "m" (*inbuf)
     287             :                     : "memory" );
     288             : 
     289           0 :       outbuf += BLOCKSIZE;
     290           0 :       inbuf  += BLOCKSIZE;
     291             :     }
     292             : 
     293           0 :   asm volatile ("movdqu %%xmm0, %[iv]\n\t"
     294             :                 : [iv] "=m" (*iv)
     295             :                 :
     296             :                 : "memory" );
     297             : 
     298           0 :   vpaes_ssse3_cleanup ();
     299           0 : }
     300             : 
     301             : 
                       /* CBC encryption of NBLOCKS 16-byte blocks from INBUF to OUTBUF.

                          XMM7 holds the chaining value, initially *IV.  Each input block is
                          XORed with XMM7, encrypted, copied into XMM7 as the new chaining
                          value and stored to OUTBUF.  When CBC_MAC is non-zero OUTBUF is not
                          advanced, so every block overwrites the same 16 bytes and only the
                          last one remains.  After the loop XMM7 is written back to *IV.  */
     302             : void
     303           0 : _gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
     304             :                         const unsigned char *inbuf, unsigned char *iv,
     305             :                         size_t nblocks, int cbc_mac)
     306             : {
     307           0 :   unsigned int nrounds = ctx->rounds;
     308             :   const void *aes_const_ptr;
     309             :   byte ssse3_state[SSSE3_STATE_SIZE];
     310             : 
     311           0 :   vpaes_ssse3_prepare_enc (aes_const_ptr);
     312             : 
     313           0 :   asm volatile ("movdqu %[iv], %%xmm7\n\t"
     314             :                 : /* No output */
     315             :                 : [iv] "m" (*iv)
     316             :                 : "memory" );
     317             : 
     318           0 :   for ( ;nblocks; nblocks-- )
     319             :     {
     320           0 :       asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
     321             :                     "pxor %%xmm7, %%xmm0\n\t"
     322             :                     : /* No output */
     323             :                     : [inbuf] "m" (*inbuf)
     324             :                     : "memory" );
     325             : 
     326           0 :       do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
     327             : 
     328           0 :       asm volatile ("movdqa %%xmm0, %%xmm7\n\t"
     329             :                     "movdqu %%xmm0, %[outbuf]\n\t"
     330             :                     : [outbuf] "=m" (*outbuf)
     331             :                     :
     332             :                     : "memory" );
     333             : 
     334           0 :       inbuf += BLOCKSIZE;
     335           0 :       if (!cbc_mac)
     336           0 :         outbuf += BLOCKSIZE;
     337             :     }
     338             : 
     339           0 :   asm volatile ("movdqu %%xmm7, %[iv]\n\t"
     340             :                 : [iv] "=m" (*iv)
     341             :                 :
     342             :                 : "memory" );
     343             : 
     344           0 :   vpaes_ssse3_cleanup ();
     345           0 : }
     346             : 
     347             : 
                       /* CTR mode processing of NBLOCKS 16-byte blocks from INBUF to OUTBUF.

                          CTR points to the 16-byte big-endian counter.  XMM7 holds the
                          current counter block and XMM6 the byte-reversal mask `be_mask'.
                          `ctrlow' is a byte-swapped (host order) copy of counter bytes
                          8..15; it is incremented alongside XMM7 and is only used to detect
                          when the low 64 bits wrap to zero, in which case a carry is added
                          to the upper 64 bits.  For every block the counter value before
                          the increment is encrypted and XORed with the input.  After the
                          loop the updated counter is written back to *CTR.

                          The counter is loaded with movdqu, so CTR needs no particular
                          alignment (the store-back uses movdqu as well).  */
     348             : void
     349           0 : _gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
     350             :                         const unsigned char *inbuf, unsigned char *ctr,
     351             :                         size_t nblocks)
     352             : {
     353             :   static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
     354             :     { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
     355           0 :   unsigned int nrounds = ctx->rounds;
     356             :   const void *aes_const_ptr;
     357             :   byte ssse3_state[SSSE3_STATE_SIZE];
     358             :   u64 ctrlow;
     359             : 
     360           0 :   vpaes_ssse3_prepare_enc (aes_const_ptr);
     361             : 
     362           0 :   asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
     363             :                 "movdqu (%[ctr]), %%xmm7\n\t"  /* Preload CTR */
     364             :                 "movq 8(%[ctr]), %q[ctrlow]\n\t"
     365             :                 "bswapq %q[ctrlow]\n\t"
     366             :                 : [ctrlow] "=r" (ctrlow)
     367             :                 : [mask] "m" (*be_mask),
     368             :                   [ctr] "r" (ctr)
     369             :                 : "memory", "cc");
     370             : 
     371           0 :   for ( ;nblocks; nblocks-- )
     372             :     {
                             /* ctrlow is modified by `incq' below, so it must be a read-write
                                ("+r") operand; as an input-only operand the compiler would be
                                free to assume its value never changes.  */
     373           0 :       asm volatile ("movdqa %%xmm7, %%xmm0\n\t"     /* xmm0 := CTR (xmm7)  */
     374             :                     "pcmpeqd %%xmm1, %%xmm1\n\t"
     375             :                     "psrldq $8, %%xmm1\n\t"         /* xmm1 = -1 */
     376             : 
     377             :                     "pshufb %%xmm6, %%xmm7\n\t"
     378             :                     "psubq  %%xmm1, %%xmm7\n\t"     /* xmm7++ (big endian) */
     379             : 
     380             :                     /* detect if 64-bit carry handling is needed */
     381             :                     "incq   %q[ctrlow]\n\t"
     382             :                     "jnz    .Lno_carry%=\n\t"
     383             : 
     384             :                     "pslldq $8, %%xmm1\n\t"         /* move lower 64-bit to high */
     385             :                     "psubq   %%xmm1, %%xmm7\n\t"    /* add carry to upper 64bits */
     386             : 
     387             :                     ".Lno_carry%=:\n\t"
     388             : 
     389             :                     "pshufb %%xmm6, %%xmm7\n\t"
     390             :                     : [ctrlow] "+r" (ctrlow)
     391             :                     : [ctr] "r" (ctr)
     392             :                     : "cc", "memory");
     393             : 
     394           0 :       do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
     395             : 
     396           0 :       asm volatile ("movdqu %[src], %%xmm1\n\t"      /* xmm1 := input   */
     397             :                     "pxor %%xmm1, %%xmm0\n\t"        /* EncCTR ^= input  */
     398             :                     "movdqu %%xmm0, %[dst]"          /* Store EncCTR.    */
     399             :                     : [dst] "=m" (*outbuf)
     400             :                     : [src] "m" (*inbuf)
     401             :                     : "memory");
     402             : 
     403           0 :       outbuf += BLOCKSIZE;
     404           0 :       inbuf  += BLOCKSIZE;
     405             :     }
     406             : 
     407           0 :   asm volatile ("movdqu %%xmm7, %[ctr]\n\t"   /* Update CTR (mem).       */
     408             :                 : [ctr] "=m" (*ctr)
     409             :                 :
     410             :                 : "memory" );
     411             : 
     412           0 :   vpaes_ssse3_cleanup ();
     413           0 : }
     414             : 
     415             : 
                       /* Decrypt the single 16-byte block at SRC and store the result at
                          DST.  Both are accessed with unaligned moves (movdqu).  The block
                          travels through XMM0 across the three asm statements, so their
                          order must not change.  Always returns 0.  */
     416             : unsigned int
     417           0 : _gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
     418             :                         const unsigned char *src)
     419             : {
     420           0 :   unsigned int nrounds = ctx->rounds;
     421             :   const void *aes_const_ptr;
     422             :   byte ssse3_state[SSSE3_STATE_SIZE];
     423             : 
     424           0 :   vpaes_ssse3_prepare_dec (aes_const_ptr);
     425           0 :   asm volatile ("movdqu %[src], %%xmm0\n\t"
     426             :                 :
     427             :                 : [src] "m" (*src)
     428             :                 : "memory" );
     429           0 :   do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr);
     430           0 :   asm volatile ("movdqu %%xmm0, %[dst]\n\t"
     431             :                 : [dst] "=m" (*dst)
     432             :                 :
     433             :                 : "memory" );
     434           0 :   vpaes_ssse3_cleanup ();
     435           0 :   return 0;
     436             : }
     437             : 
     438             : 
                       /* CFB decryption of NBLOCKS 16-byte blocks from INBUF to OUTBUF.

                          Note that this uses the *encryption* set-up and core: the feedback
                          value in XMM0 (initially *IV) is encrypted, moved to XMM6 and XORed
                          with the input block to give the output.  The input block itself
                          is left in XMM0 as the feedback for the next block.  After the
                          loop XMM0 is written back to *IV.  */
     439             : void
     440           0 : _gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
     441             :                         const unsigned char *inbuf, unsigned char *iv,
     442             :                         size_t nblocks)
     443             : {
     444           0 :   unsigned int nrounds = ctx->rounds;
     445             :   const void *aes_const_ptr;
     446             :   byte ssse3_state[SSSE3_STATE_SIZE];
     447             : 
     448           0 :   vpaes_ssse3_prepare_enc (aes_const_ptr);
     449             : 
     450           0 :   asm volatile ("movdqu %[iv], %%xmm0\n\t"
     451             :                 : /* No output */
     452             :                 : [iv] "m" (*iv)
     453             :                 : "memory" );
     454             : 
     455           0 :   for ( ;nblocks; nblocks-- )
     456             :     {
     457           0 :       do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
     458             : 
     459           0 :       asm volatile ("movdqa %%xmm0, %%xmm6\n\t"    /* xmm6 := encrypted feedback */
     460             :                     "movdqu %[inbuf], %%xmm0\n\t"  /* xmm0 := input, next feedback */
     461             :                     "pxor %%xmm0, %%xmm6\n\t"
     462             :                     "movdqu %%xmm6, %[outbuf]\n\t"
     463             :                     : [outbuf] "=m" (*outbuf)
     464             :                     : [inbuf] "m" (*inbuf)
     465             :                     : "memory" );
     466             : 
     467           0 :       outbuf += BLOCKSIZE;
     468           0 :       inbuf  += BLOCKSIZE;
     469             :     }
     470             : 
     471           0 :   asm volatile ("movdqu %%xmm0, %[iv]\n\t"
     472             :                 : [iv] "=m" (*iv)
     473             :                 :
     474             :                 : "memory" );
     475             : 
     476           0 :   vpaes_ssse3_cleanup ();
     477           0 : }
     478             : 
     479             : 
                       /* CBC decryption of NBLOCKS 16-byte blocks from INBUF to OUTBUF.

                          XMM7 holds the chaining value, initially *IV.  Each input block is
                          loaded into XMM0 and copied to XMM6 before decryption, because the
                          decrypt core overwrites XMM0 and INBUF may be the same memory as
                          OUTBUF.  The decrypted block is XORed with XMM7 and stored, then
                          the saved input block becomes the new chaining value.  After the
                          loop XMM7 is written back to *IV; *IV is therefore declared as an
                          output ("=m") operand of that final asm statement.  */
     480             : void
     481           0 : _gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
     482             :                         const unsigned char *inbuf, unsigned char *iv,
     483             :                         size_t nblocks)
     484             : {
     485           0 :   unsigned int nrounds = ctx->rounds;
     486             :   const void *aes_const_ptr;
     487             :   byte ssse3_state[SSSE3_STATE_SIZE];
     488             : 
     489           0 :   vpaes_ssse3_prepare_dec (aes_const_ptr);
     490             : 
     491           0 :   asm volatile
     492             :     ("movdqu %[iv], %%xmm7\n\t"       /* use xmm7 as fast IV storage */
     493             :     : /* No output */
     494             :     : [iv] "m" (*iv)
     495             :     : "memory");
     496             : 
     497           0 :   for ( ;nblocks; nblocks-- )
     498             :     {
     499           0 :       asm volatile
     500             :         ("movdqu %[inbuf], %%xmm0\n\t"
     501             :         "movdqa %%xmm0, %%xmm6\n\t"    /* use xmm6 as savebuf */
     502             :         : /* No output */
     503             :         : [inbuf] "m" (*inbuf)
     504             :         : "memory");
     505             : 
     506           0 :       do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr);
     507             : 
     508           0 :       asm volatile
     509             :         ("pxor %%xmm7, %%xmm0\n\t"    /* xor IV with output */
     510             :         "movdqu %%xmm0, %[outbuf]\n\t"
     511             :         "movdqa %%xmm6, %%xmm7\n\t"   /* store savebuf as new IV */
     512             :         : [outbuf] "=m" (*outbuf)
     513             :         :
     514             :         : "memory");
     515             : 
     516           0 :       outbuf += BLOCKSIZE;
     517           0 :       inbuf  += BLOCKSIZE;
     518             :     }
     519             : 
     520           0 :   asm volatile
     521             :     ("movdqu %%xmm7, %[iv]\n\t"       /* store IV */
     522             :     : [iv] "=m" (*iv)
     523             :     : /* No input */
     524             :     : "memory");
     525             : 
     526           0 :   vpaes_ssse3_cleanup ();
     527           0 : }
     528             : 
     529             : 
                       /* OCB encryption of NBLOCKS 16-byte blocks from INBUF_ARG to
                          OUTBUF_ARG using the cipher handle C.

                          XMM7 holds the running Offset (loaded from and stored back to
                          c->u_iv.iv) and XMM6 the running Checksum (c->u_ctr.ctr).  The
                          block counter starts at c->u_mode.ocb.data_nblocks, is
                          pre-incremented for each block to select L via ocb_get_l(), and is
                          written back after the loop.

                          NOTE(review): ocb_get_l() is called while XMM6/XMM7 hold live
                          state between asm statements; it must not touch SSE registers --
                          confirm against its definition, which is not visible here.  */
     530             : static void
     531           0 : ssse3_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
     532             :                const void *inbuf_arg, size_t nblocks)
     533             : {
     534           0 :   RIJNDAEL_context *ctx = (void *)&c->context.c;
     535           0 :   unsigned char *outbuf = outbuf_arg;
     536           0 :   const unsigned char *inbuf = inbuf_arg;
     537           0 :   u64 n = c->u_mode.ocb.data_nblocks;
     538           0 :   unsigned int nrounds = ctx->rounds;
     539             :   const void *aes_const_ptr;
     540             :   byte ssse3_state[SSSE3_STATE_SIZE];
     541             : 
     542           0 :   vpaes_ssse3_prepare_enc (aes_const_ptr);
     543             : 
     544             :   /* Preload Offset and Checksum */
     545           0 :   asm volatile ("movdqu %[iv], %%xmm7\n\t"
     546             :                 "movdqu %[ctr], %%xmm6\n\t"
     547             :                 : /* No output */
     548             :                 : [iv] "m" (*c->u_iv.iv),
     549             :                   [ctr] "m" (*c->u_ctr.ctr)
     550             :                 : "memory" );
     551             : 
     552           0 :   for ( ;nblocks; nblocks-- )
     553             :     {
     554             :       const unsigned char *l;
     555             : 
     556           0 :       l = ocb_get_l(c, ++n);
     557             : 
     558             :       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
     559             :       /* Checksum_i = Checksum_{i-1} xor P_i  */
     560             :       /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
     561           0 :       asm volatile ("movdqu %[l],     %%xmm1\n\t"
     562             :                     "movdqu %[inbuf], %%xmm0\n\t"
     563             :                     "pxor   %%xmm1,   %%xmm7\n\t"  /* Offset ^= L */
     564             :                     "pxor   %%xmm0,   %%xmm6\n\t"  /* Checksum ^= P_i */
     565             :                     "pxor   %%xmm7,   %%xmm0\n\t"  /* xmm0 = P_i ^ Offset */
     566             :                     :
     567             :                     : [l] "m" (*l),
     568             :                       [inbuf] "m" (*inbuf)
     569             :                     : "memory" );
     570             : 
     571           0 :       do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
     572             : 
     573           0 :       asm volatile ("pxor   %%xmm7, %%xmm0\n\t"    /* C_i = result ^ Offset */
     574             :                     "movdqu %%xmm0, %[outbuf]\n\t"
     575             :                     : [outbuf] "=m" (*outbuf)
     576             :                     :
     577             :                     : "memory" );
     578             : 
     579           0 :       inbuf += BLOCKSIZE;
     580           0 :       outbuf += BLOCKSIZE;
     581             :     }
     582             : 
     583           0 :   c->u_mode.ocb.data_nblocks = n;
     584           0 :   asm volatile ("movdqu %%xmm7, %[iv]\n\t"
     585             :                 "movdqu %%xmm6, %[ctr]\n\t"
     586             :                 : [iv] "=m" (*c->u_iv.iv),
     587             :                   [ctr] "=m" (*c->u_ctr.ctr)
     588             :                 :
     589             :                 : "memory" );
     590             : 
     591           0 :   vpaes_ssse3_cleanup ();
     592           0 : }
     593             : 
     594             : static void
     595           0 : ssse3_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
     596             :                const void *inbuf_arg, size_t nblocks)
     597             : { /* OCB decryption of NBLOCKS consecutive BLOCKSIZE-byte blocks read
                          from INBUF_ARG and written to OUTBUF_ARG.  Offset is kept in
                          %xmm7 and Checksum in %xmm6 for the whole loop: both are loaded
                          from c->u_iv.iv / c->u_ctr.ctr on entry and stored back on exit,
                          and the advanced block counter is written back to
                          c->u_mode.ocb.data_nblocks.
                          NOTE(review): the register contents have to survive between the
                          separate asm statements and across ocb_get_l(); nothing in this
                          function enforces that -- confirm it holds for the way this file
                          is built.  */
     598           0 :   RIJNDAEL_context *ctx = (void *)&c->context.c;
     599           0 :   unsigned char *outbuf = outbuf_arg;
     600           0 :   const unsigned char *inbuf = inbuf_arg;
     601           0 :   u64 n = c->u_mode.ocb.data_nblocks; /* running data block counter */
     602           0 :   unsigned int nrounds = ctx->rounds;
     603             :   const void *aes_const_ptr; /* not assigned here; presumably set by the prepare macro below -- verify */
     604             :   byte ssse3_state[SSSE3_STATE_SIZE]; /* never named again in this function; presumably used by the prepare/cleanup macros -- verify */
     605             : 
     606           0 :   vpaes_ssse3_prepare_dec (aes_const_ptr);
     607             : 
     608             :   /* Preload Offset (c->u_iv.iv) into %xmm7 and Checksum (c->u_ctr.ctr) into %xmm6 */
     609           0 :   asm volatile ("movdqu %[iv], %%xmm7\n\t"
     610             :                 "movdqu %[ctr], %%xmm6\n\t"
     611             :                 : /* No output */
     612             :                 : [iv] "m" (*c->u_iv.iv),
     613             :                   [ctr] "m" (*c->u_ctr.ctr)
     614             :                 : "memory" );
     615             : 
     616           0 :   for ( ;nblocks; nblocks-- )
     617             :     {
     618             :       const unsigned char *l;
     619             : 
     620           0 :       l = ocb_get_l(c, ++n); /* counter is advanced first, then used to look up L */
     621             : 
     622             :       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
     623             :       /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
     624             :       /* Checksum_i = Checksum_{i-1} xor P_i  */
     625           0 :       asm volatile ("movdqu %[l],     %%xmm1\n\t" /* xmm1 = *l */
     626             :                     "movdqu %[inbuf], %%xmm0\n\t" /* xmm0 = C_i */
     627             :                     "pxor   %%xmm1,   %%xmm7\n\t" /* Offset ^= L */
     628             :                     "pxor   %%xmm7,   %%xmm0\n\t" /* xmm0 = C_i ^ Offset_i */
     629             :                     :
     630             :                     : [l] "m" (*l),
     631             :                       [inbuf] "m" (*inbuf)
     632             :                     : "memory" );
     633             : 
     634           0 :       do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr); /* defined outside this view; presumably transforms %xmm0 in place and leaves %xmm6/%xmm7 alone -- verify */
     635             : 
     636           0 :       asm volatile ("pxor   %%xmm7, %%xmm0\n\t" /* P_i = xmm0 ^ Offset_i */
     637             :                     "pxor   %%xmm0, %%xmm6\n\t" /* Checksum ^= P_i */
     638             :                     "movdqu %%xmm0, %[outbuf]\n\t" /* store P_i */
     639             :                     : [outbuf] "=m" (*outbuf)
     640             :                     :
     641             :                     : "memory" );
     642             : 
     643           0 :       inbuf += BLOCKSIZE;
     644           0 :       outbuf += BLOCKSIZE;
     645             :     }
     646             : 
     647           0 :   c->u_mode.ocb.data_nblocks = n;
     648           0 :   asm volatile ("movdqu %%xmm7, %[iv]\n\t" /* write Offset back */
     649             :                 "movdqu %%xmm6, %[ctr]\n\t" /* write Checksum back */
     650             :                 : [iv] "=m" (*c->u_iv.iv),
     651             :                   [ctr] "=m" (*c->u_ctr.ctr)
     652             :                 :
     653             :                 : "memory" );
     654             : 
     655           0 :   vpaes_ssse3_cleanup ();
     656           0 : }
     657             : 
     658             : 
     659             : void
     660           0 : _gcry_aes_ssse3_ocb_crypt(gcry_cipher_hd_t c, void *outbuf_arg,
     661             :                           const void *inbuf_arg, size_t nblocks, int encrypt)
     662             : { /* Bulk OCB entry point: a zero ENCRYPT selects decryption, anything else encryption.  */
     663           0 :   if (!encrypt)
     664           0 :     ssse3_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
     665             :   else
     666           0 :     ssse3_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
     667           0 : }
     668             : 
     669             : 
     670             : void
     671           0 : _gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     672             :                           size_t nblocks)
     673             : { /* Absorb NBLOCKS BLOCKSIZE-byte blocks of associated data from ABUF_ARG
                          into the OCB authentication state.  Offset is kept in %xmm7 and
                          Sum in %xmm6; both are loaded from c->u_mode.ocb.aad_offset /
                          aad_sum on entry and stored back on exit, and the advanced
                          counter is written to c->u_mode.ocb.aad_nblocks.  No data output
                          is produced.
                          NOTE(review): %xmm6/%xmm7 must survive between the separate asm
                          statements and across ocb_get_l(); nothing in this function
                          enforces that -- confirm.  */
     674           0 :   RIJNDAEL_context *ctx = (void *)&c->context.c;
     675           0 :   const unsigned char *abuf = abuf_arg;
     676           0 :   u64 n = c->u_mode.ocb.aad_nblocks; /* running AAD block counter */
     677           0 :   unsigned int nrounds = ctx->rounds;
     678             :   const void *aes_const_ptr; /* not assigned here; presumably set by the prepare macro below -- verify */
     679             :   byte ssse3_state[SSSE3_STATE_SIZE]; /* never named again in this function; presumably used by the prepare/cleanup macros -- verify */
     680             : 
     681           0 :   vpaes_ssse3_prepare_enc (aes_const_ptr);
     682             : 
     683             :   /* Preload Offset (aad_offset) into %xmm7 and Sum (aad_sum) into %xmm6 */
     684           0 :   asm volatile ("movdqu %[iv], %%xmm7\n\t"
     685             :                 "movdqu %[ctr], %%xmm6\n\t"
     686             :                 : /* No output */
     687             :                 : [iv] "m" (*c->u_mode.ocb.aad_offset),
     688             :                   [ctr] "m" (*c->u_mode.ocb.aad_sum)
     689             :                 : "memory" );
     690             : 
     691           0 :   for ( ;nblocks; nblocks-- )
     692             :     {
     693             :       const unsigned char *l;
     694             : 
     695           0 :       l = ocb_get_l(c, ++n); /* counter is advanced first, then used to look up L */
     696             : 
     697             :       /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
     698             :       /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
     699           0 :       asm volatile ("movdqu %[l],     %%xmm1\n\t" /* xmm1 = *l */
     700             :                     "movdqu %[abuf],  %%xmm0\n\t" /* xmm0 = A_i */
     701             :                     "pxor   %%xmm1,   %%xmm7\n\t" /* Offset ^= L */
     702             :                     "pxor   %%xmm7,   %%xmm0\n\t" /* xmm0 = A_i ^ Offset_i */
     703             :                     :
     704             :                     : [l] "m" (*l),
     705             :                       [abuf] "m" (*abuf)
     706             :                     : "memory" );
     707             : 
     708           0 :       do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr); /* defined outside this view; presumably transforms %xmm0 in place and leaves %xmm6/%xmm7 alone -- verify */
     709             : 
     710           0 :       asm volatile ("pxor   %%xmm0,   %%xmm6\n\t" /* Sum ^= xmm0 */
     711             :                     :
     712             :                     :
     713             :                     : "memory" );
     714             : 
     715           0 :       abuf += BLOCKSIZE;
     716             :     }
     717             : 
     718           0 :   c->u_mode.ocb.aad_nblocks = n;
     719           0 :   asm volatile ("movdqu %%xmm7, %[iv]\n\t" /* write Offset back */
     720             :                 "movdqu %%xmm6, %[ctr]\n\t" /* write Sum back */
     721             :                 : [iv] "=m" (*c->u_mode.ocb.aad_offset),
     722             :                   [ctr] "=m" (*c->u_mode.ocb.aad_sum)
     723             :                 :
     724             :                 : "memory" );
     725             : 
     726           0 :   vpaes_ssse3_cleanup ();
     727           0 : }
     728             : 
     729             : 
     730             : #ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS /* X() wraps the .type/.size directives in the asm text below */
     731             : # define X(...) /* expands to nothing, so the wrapped directives are dropped */
     732             : #else
     733             : # define X(...) __VA_ARGS__ /* passes the wrapped directives through unchanged */
     734             : #endif
     735             : 
     736             : asm (
     737             :   "\n\t" "##"
     738             :   "\n\t" "## Constant-time SSSE3 AES core implementation."
     739             :   "\n\t" "##"
     740             :   "\n\t" "## By Mike Hamburg (Stanford University), 2009"
     741             :   "\n\t" "## Public domain."
     742             :   "\n\t" "##"
     743             : 
     744             :   "\n\t" ".text"
     745             : 
     746             :   "\n\t" "##"
     747             :   "\n\t" "##  _aes_encrypt_core"
     748             :   "\n\t" "##"
     749             :   "\n\t" "##  AES-encrypt %xmm0."
     750             :   "\n\t" "##"
     751             :   "\n\t" "##  Inputs:"
     752             :   "\n\t" "##     %xmm0 = input"
     753             :   "\n\t" "##     %xmm9-%xmm15 as in .Laes_preheat"
     754             :   "\n\t" "##     %rcx  = .Laes_consts"
     755             :   "\n\t" "##    (%rdx) = scheduled keys"
     756             :   "\n\t" "##     %rax  = nrounds - 1"
     757             :   "\n\t" "##"
     758             :   "\n\t" "##  Output in %xmm0"
     759             :   "\n\t" "##  Clobbers  %xmm1-%xmm4, %r9, %r11, %rax" /* NOTE(review): the body below
                          writes %rdi, %rsi, %rdx and %rax and never mentions %r9 or %r11,
                          so this list looks stale.  */
     760             :   "\n\t" "##  Preserves %xmm6 - %xmm7 so you get some local vectors"
     761             :   "\n\t" "##"
     762             :   "\n\t" "##"
     763             :   "\n\t" ".align 16"
     764             : X("\n\t" ".type _aes_encrypt_core,@function")
     765             :   "\n\t" "_aes_encrypt_core:"
     766             :   "\n\t" "       leaq    .Lk_mc_backward(%rcx), %rdi" /* rdi = base of the mc tables */
     767             :   "\n\t" "       mov     $16,    %rsi" /* rsi = byte offset into the mc tables, stepped by 16 per round */
     768             :   "\n\t" "       movdqa  .Lk_ipt   (%rcx), %xmm2 # iptlo"
     769             :   "\n\t" "       movdqa  %xmm9,  %xmm1"
     770             :   "\n\t" "       pandn   %xmm0,  %xmm1"
     771             :   "\n\t" "       psrld   $4,     %xmm1"
     772             :   "\n\t" "       pand    %xmm9,  %xmm0"
     773             :   "\n\t" "       pshufb  %xmm0,  %xmm2"
     774             :   "\n\t" "       movdqa  .Lk_ipt+16(%rcx), %xmm0 # ipthi"
     775             :   "\n\t" "       pshufb  %xmm1,  %xmm0"
     776             :   "\n\t" "       pxor    (%rdx),%xmm2" /* mix in the first 16 bytes at (%rdx) */
     777             :   "\n\t" "       pxor    %xmm2,  %xmm0"
     778             :   "\n\t" "       add     $16,    %rdx" /* also sets the flags seen by the first jnz below */
     779             :   "\n\t" "       jmp     .Laes_entry"
     780             : 
     781             :   "\n\t" ".align 8"
     782             :   "\n\t" ".Laes_loop:"
     783             :   "\n\t" "       # middle of middle round"
     784             :   "\n\t" "       movdqa  %xmm13, %xmm4   # 4 : sb1u"
     785             :   "\n\t" "       pshufb  %xmm2,  %xmm4   # 4 = sb1u"
     786             :   "\n\t" "       pxor    (%rdx), %xmm4   # 4 = sb1u + k"
     787             :   "\n\t" "       movdqa  %xmm12, %xmm0   # 0 : sb1t"
     788             :   "\n\t" "       pshufb  %xmm3,  %xmm0   # 0 = sb1t"
     789             :   "\n\t" "       pxor    %xmm4,  %xmm0   # 0 = A"
     790             :   "\n\t" "       movdqa  %xmm15, %xmm4   # 4 : sb2u"
     791             :   "\n\t" "       pshufb  %xmm2,  %xmm4   # 4 = sb2u"
     792             :   "\n\t" "       movdqa  .Lk_mc_forward-.Lk_mc_backward(%rsi,%rdi), %xmm1"
     793             :   "\n\t" "       movdqa  %xmm14, %xmm2   # 2 : sb2t"
     794             :   "\n\t" "       pshufb  %xmm3,  %xmm2   # 2 = sb2t"
     795             :   "\n\t" "       pxor    %xmm4,  %xmm2   # 2 = 2A"
     796             :   "\n\t" "       movdqa  %xmm0,  %xmm3   # 3 = A"
     797             :   "\n\t" "       pshufb  %xmm1,  %xmm0   # 0 = B"
     798             :   "\n\t" "       pxor    %xmm2,  %xmm0   # 0 = 2A+B"
     799             :   "\n\t" "       pshufb  (%rsi,%rdi), %xmm3  # 3 = D"
     800             :   "\n\t" "       lea     16(%esi),%esi   # next mc" /* 32-bit form: the write to %esi
                          zero-extends into %rsi; %rsi starts at 16 and is masked with 48
                          below, so the value never exceeds 64 here.  */
     801             :   "\n\t" "       pxor    %xmm0,  %xmm3   # 3 = 2A+B+D"
     802             :   "\n\t" "       lea     16(%rdx),%rdx   # next key"
     803             :   "\n\t" "       pshufb  %xmm1,  %xmm0   # 0 = 2B+C"
     804             :   "\n\t" "       pxor    %xmm3,  %xmm0   # 0 = 2A+3B+C+D"
     805             :   "\n\t" "       and     $48, %rsi       # ... mod 4"
     806             :   "\n\t" "       dec     %rax            # nr--" /* ZF from this dec is what the jnz below tests */
     807             : 
     808             :   "\n\t" ".Laes_entry:"
     809             :   "\n\t" "       # top of round"
     810             :   "\n\t" "       movdqa  %xmm9,  %xmm1   # 1 : i"
     811             :   "\n\t" "       pandn   %xmm0,  %xmm1   # 1 = i<<4"
     812             :   "\n\t" "       psrld   $4,     %xmm1   # 1 = i"
     813             :   "\n\t" "       pand    %xmm9,  %xmm0   # 0 = k"
     814             :   "\n\t" "       movdqa  %xmm11, %xmm2   # 2 : a/k"
     815             :   "\n\t" "       pshufb  %xmm0,  %xmm2   # 2 = a/k"
     816             :   "\n\t" "       pxor    %xmm1,  %xmm0   # 0 = j"
     817             :   "\n\t" "       movdqa  %xmm10, %xmm3   # 3 : 1/i"
     818             :   "\n\t" "       pshufb  %xmm1,  %xmm3   # 3 = 1/i"
     819             :   "\n\t" "       pxor    %xmm2,  %xmm3   # 3 = iak = 1/i + a/k"
     820             :   "\n\t" "       movdqa  %xmm10, %xmm4   # 4 : 1/j"
     821             :   "\n\t" "       pshufb  %xmm0,  %xmm4   # 4 = 1/j"
     822             :   "\n\t" "       pxor    %xmm2,  %xmm4   # 4 = jak = 1/j + a/k"
     823             :   "\n\t" "       movdqa  %xmm10, %xmm2   # 2 : 1/iak"
     824             :   "\n\t" "       pshufb  %xmm3,  %xmm2   # 2 = 1/iak"
     825             :   "\n\t" "       pxor    %xmm0,  %xmm2   # 2 = io"
     826             :   "\n\t" "       movdqa  %xmm10, %xmm3   # 3 : 1/jak"
     827             :   "\n\t" "       pshufb  %xmm4,  %xmm3   # 3 = 1/jak"
     828             :   "\n\t" "       pxor    %xmm1,  %xmm3   # 3 = jo"
     829             :   "\n\t" "       jnz     .Laes_loop" /* only SSE instructions sit between the flag-setting
                          dec (or, on the first pass, the add before the jmp) and this
                          branch, so the flags are still intact.  */
     830             : 
     831             :   "\n\t" "       # middle of last round"
     832             :   "\n\t" "       movdqa  .Lk_sbo(%rcx), %xmm4    # 3 : sbou" /* destination is %xmm4, although the asm comment says 3 */
     833             :   "\n\t" "       pshufb  %xmm2,  %xmm4   # 4 = sbou"
     834             :   "\n\t" "       pxor    (%rdx), %xmm4   # 4 = sb1u + k"
     835             :   "\n\t" "       movdqa  .Lk_sbo+16(%rcx), %xmm0 # 0 : sbot"
     836             :   "\n\t" "       pshufb  %xmm3,  %xmm0   # 0 = sb1t"
     837             :   "\n\t" "       pxor    %xmm4,  %xmm0   # 0 = A"
     838             :   "\n\t" "       pshufb  .Lk_sr(%rsi,%rcx), %xmm0" /* final byte shuffle, .Lk_sr entry selected by %rsi */
     839             :   "\n\t" "       ret"
     840             : X("\n\t" ".size _aes_encrypt_core,.-_aes_encrypt_core")
     841             : 
     842             :   "\n\t" "##"
     843             :   "\n\t" "##  Decryption core"
     844             :   "\n\t" "##"
     845             :   "\n\t" "##  Same API as encryption core." /* NOTE(review): this routine additionally
                          reads %xmm8 and overwrites %xmm5; it writes %rsi, %rdx and %rax
                          but, unlike the encryption core, not %rdi.  */
     846             :   "\n\t" "##"
     847             :   "\n\t" ".align 16"
     848             : X("\n\t" ".type _aes_decrypt_core,@function")
     849             :   "\n\t" "_aes_decrypt_core:"
     850             :   "\n\t" "       movl    %eax,   %esi" /* next four lines: esi = ((eax << 4) ^ 48) & 48 ... */
     851             :   "\n\t" "       shll    $4,     %esi"
     852             :   "\n\t" "       xorl    $48,    %esi"
     853             :   "\n\t" "       andl    $48,    %esi" /* ... used only by the final .Lk_sr shuffle */
     854             :   "\n\t" "       movdqa  .Lk_dipt   (%rcx), %xmm2 # iptlo"
     855             :   "\n\t" "       movdqa  %xmm9,  %xmm1"
     856             :   "\n\t" "       pandn   %xmm0,  %xmm1"
     857             :   "\n\t" "       psrld   $4,     %xmm1"
     858             :   "\n\t" "       pand    %xmm9,  %xmm0"
     859             :   "\n\t" "       pshufb  %xmm0,  %xmm2"
     860             :   "\n\t" "       movdqa  .Lk_dipt+16(%rcx), %xmm0 # ipthi"
     861             :   "\n\t" "       pshufb  %xmm1,  %xmm0"
     862             :   "\n\t" "       pxor    (%rdx), %xmm2"
     863             :   "\n\t" "       pxor    %xmm2,  %xmm0"
     864             :   "\n\t" "       movdqa  .Lk_mc_forward+48(%rcx), %xmm5" /* xmm5 = shuffle mask used for the MC steps, rotated every round */
     865             :   "\n\t" "       lea     16(%rdx), %rdx"
     866             :   "\n\t" "       neg     %rax" /* the counter is negated and then counted up to zero with inc */
     867             :   "\n\t" "       jmp     .Laes_dec_entry"
     868             : 
     869             :   "\n\t" ".align 16"
     870             :   "\n\t" ".Laes_dec_loop:"
     871             :   "\n\t" "##"
     872             :   "\n\t" "##  Inverse mix columns"
     873             :   "\n\t" "##"
     874             :   "\n\t" "       movdqa  %xmm13, %xmm4           # 4 : sb9u"
     875             :   "\n\t" "       pshufb  %xmm2,  %xmm4           # 4 = sb9u"
     876             :   "\n\t" "       pxor    (%rdx), %xmm4"
     877             :   "\n\t" "       movdqa  %xmm12, %xmm0           # 0 : sb9t"
     878             :   "\n\t" "       pshufb  %xmm3,  %xmm0           # 0 = sb9t"
     879             :   "\n\t" "       movdqa  .Lk_dsbd+16(%rcx),%xmm1 # 1 : sbdt"
     880             :   "\n\t" "       pxor    %xmm4,  %xmm0           # 0 = ch"
     881             :   "\n\t" "       lea     16(%rdx), %rdx          # next round key"
     882             : 
     883             :   "\n\t" "       pshufb  %xmm5,  %xmm0           # MC ch"
     884             :   "\n\t" "       movdqa  %xmm15, %xmm4           # 4 : sbdu"
     885             :   "\n\t" "       pshufb  %xmm2,  %xmm4           # 4 = sbdu"
     886             :   "\n\t" "       pxor    %xmm0,  %xmm4           # 4 = ch"
     887             :   "\n\t" "       pshufb  %xmm3,  %xmm1           # 1 = sbdt"
     888             :   "\n\t" "       pxor    %xmm4,  %xmm1           # 1 = ch"
     889             : 
     890             :   "\n\t" "       pshufb  %xmm5,  %xmm1           # MC ch"
     891             :   "\n\t" "       movdqa  %xmm14, %xmm4           # 4 : sbbu"
     892             :   "\n\t" "       pshufb  %xmm2,  %xmm4           # 4 = sbbu"
     893             :   "\n\t" "      inc     %rax                    # nr--" /* inc, not dec, because %rax was negated above; ZF feeds the jnz below */
     894             :   "\n\t" "       pxor    %xmm1,  %xmm4           # 4 = ch"
     895             :   "\n\t" "       movdqa  .Lk_dsbb+16(%rcx),%xmm0 # 0 : sbbt"
     896             :   "\n\t" "       pshufb  %xmm3,  %xmm0           # 0 = sbbt"
     897             :   "\n\t" "       pxor    %xmm4,  %xmm0           # 0 = ch"
     898             : 
     899             :   "\n\t" "       pshufb  %xmm5,  %xmm0           # MC ch"
     900             :   "\n\t" "       movdqa  %xmm8,  %xmm4           # 4 : sbeu"
     901             :   "\n\t" "       pshufb  %xmm2,  %xmm4           # 4 = sbeu"
     902             :   "\n\t" "       pshufd  $0x93,  %xmm5,  %xmm5" /* rotate the dwords of the MC mask for the next round */
     903             :   "\n\t" "       pxor    %xmm0,  %xmm4           # 4 = ch"
     904             :   "\n\t" "       movdqa  .Lk_dsbe+16(%rcx),%xmm0 # 0 : sbet"
     905             :   "\n\t" "       pshufb  %xmm3,  %xmm0           # 0 = sbet"
     906             :   "\n\t" "       pxor    %xmm4,  %xmm0           # 0 = ch"
     907             : 
     908             :   "\n\t" ".Laes_dec_entry:"
     909             :   "\n\t" "       # top of round"
     910             :   "\n\t" "       movdqa  %xmm9,  %xmm1   # 1 : i"
     911             :   "\n\t" "       pandn   %xmm0,  %xmm1   # 1 = i<<4"
     912             :   "\n\t" "       psrld   $4,     %xmm1   # 1 = i"
     913             :   "\n\t" "       pand    %xmm9,  %xmm0   # 0 = k"
     914             :   "\n\t" "       movdqa  %xmm11, %xmm2   # 2 : a/k"
     915             :   "\n\t" "       pshufb  %xmm0,  %xmm2   # 2 = a/k"
     916             :   "\n\t" "       pxor    %xmm1,  %xmm0   # 0 = j"
     917             :   "\n\t" "       movdqa  %xmm10, %xmm3   # 3 : 1/i"
     918             :   "\n\t" "       pshufb  %xmm1,  %xmm3   # 3 = 1/i"
     919             :   "\n\t" "       pxor    %xmm2,  %xmm3   # 3 = iak = 1/i + a/k"
     920             :   "\n\t" "       movdqa  %xmm10, %xmm4   # 4 : 1/j"
     921             :   "\n\t" "       pshufb  %xmm0,  %xmm4   # 4 = 1/j"
     922             :   "\n\t" "       pxor    %xmm2,  %xmm4   # 4 = jak = 1/j + a/k"
     923             :   "\n\t" "       movdqa  %xmm10, %xmm2   # 2 : 1/iak"
     924             :   "\n\t" "       pshufb  %xmm3,  %xmm2   # 2 = 1/iak"
     925             :   "\n\t" "       pxor    %xmm0,  %xmm2   # 2 = io"
     926             :   "\n\t" "       movdqa  %xmm10, %xmm3   # 3 : 1/jak"
     927             :   "\n\t" "       pshufb  %xmm4,  %xmm3   # 3 = 1/jak"
     928             :   "\n\t" "       pxor    %xmm1,  %xmm3   # 3 = jo"
     929             :   "\n\t" "       jnz     .Laes_dec_loop" /* tests ZF left by the inc above (or by the neg on
                          the first pass); only SSE instructions execute in between.  */
     930             : 
     931             :   "\n\t" "       # middle of last round"
     932             :   "\n\t" "       movdqa  .Lk_dsbo(%rcx), %xmm4           # 3 : sbou" /* destination is %xmm4, although the asm comment says 3 */
     933             :   "\n\t" "       pshufb  %xmm2,  %xmm4   # 4 = sbou"
     934             :   "\n\t" "       pxor    (%rdx), %xmm4   # 4 = sb1u + k"
     935             :   "\n\t" "       movdqa  .Lk_dsbo+16(%rcx), %xmm0        # 0 : sbot"
     936             :   "\n\t" "       pshufb  %xmm3,  %xmm0   # 0 = sb1t"
     937             :   "\n\t" "       pxor    %xmm4,  %xmm0   # 0 = A"
     938             :   "\n\t" "       pshufb  .Lk_sr(%rsi,%rcx), %xmm0" /* final byte shuffle, .Lk_sr entry selected by the %esi computed at entry */
     939             :   "\n\t" "       ret"
     940             : X("\n\t" ".size _aes_decrypt_core,.-_aes_decrypt_core")
     941             : 
     942             :   "\n\t" "########################################################"
     943             :   "\n\t" "##                                                    ##"
     944             :   "\n\t" "##                  AES key schedule                  ##"
     945             :   "\n\t" "##                                                    ##"
     946             :   "\n\t" "########################################################"
     947             : 
     948             :   "\n\t" ".align 16"
     949             : X("\n\t" ".type _aes_schedule_core,@function")
     950             :   "\n\t" "_aes_schedule_core:" /* Key-schedule entry point: transforms the first 16
                          key bytes, emits the zeroth round key, then dispatches on the
                          key size in %rsi.  */
     951             :   "\n\t" "       # rdi = key"
     952             :   "\n\t" "       # rsi = size in bits"
     953             :   "\n\t" "       # rdx = buffer"
     954             :   "\n\t" "       # rcx = direction.  0=encrypt, 1=decrypt" /* NOTE(review): %r8 is also
                          an input -- the decrypt path below uses it to index .Lk_sr and
                          then xors it with 48 -- but it is not listed here.  */
     955             : 
     956             :   "\n\t" "       # load the tables"
     957             :   "\n\t" "       lea     .Laes_consts(%rip), %r10" /* r10 = base address for every table reference in this routine */
     958             :   "\n\t" "       movdqa            (%r10), %xmm9  # 0F"
     959             :   "\n\t" "       movdqa  .Lk_inv   (%r10), %xmm10 # inv"
     960             :   "\n\t" "       movdqa  .Lk_inv+16(%r10), %xmm11 # inva"
     961             :   "\n\t" "       movdqa  .Lk_sb1   (%r10), %xmm13 # sb1u"
     962             :   "\n\t" "       movdqa  .Lk_sb1+16(%r10), %xmm12 # sb1t"
     963             :   "\n\t" "       movdqa  .Lk_sb2   (%r10), %xmm15 # sb2u"
     964             :   "\n\t" "       movdqa  .Lk_sb2+16(%r10), %xmm14 # sb2t"
     965             : 
     966             :   "\n\t" "       movdqa  .Lk_rcon(%r10), %xmm8   # load rcon"
     967             :   "\n\t" "       movdqu  (%rdi), %xmm0           # load key (unaligned)"
     968             : 
     969             :   "\n\t" "       # input transform"
     970             :   "\n\t" "       movdqu  %xmm0,  %xmm3" /* keep an untransformed copy; the decrypt path below uses it */
     971             :   "\n\t" "       lea     .Lk_ipt(%r10), %r11" /* presumably the table argument of .Laes_schedule_transform, which is outside this view -- verify */
     972             :   "\n\t" "       call    .Laes_schedule_transform"
     973             :   "\n\t" "       movdqu  %xmm0,  %xmm7" /* xmm7 = copy of the transformed key */
     974             : 
     975             :   "\n\t" "       test    %rcx,   %rcx"
     976             :   "\n\t" "       jnz     .Laes_schedule_am_decrypting"
     977             : 
     978             :   "\n\t" "       # encrypting, output zeroth round key after transform"
     979             :   "\n\t" "       movdqa  %xmm0,  (%rdx)" /* aligned store: requires a 16-byte aligned buffer in %rdx */
     980             :   "\n\t" "       jmp     .Laes_schedule_go"
     981             : 
     982             :   "\n\t" ".Laes_schedule_am_decrypting:"
     983             :   "\n\t" "       # decrypting, output zeroth round key after shiftrows"
     984             :   "\n\t" "       pshufb  .Lk_sr(%r8,%r10),%xmm3"
     985             :   "\n\t" "       movdqa  %xmm3,  (%rdx)" /* aligned store: requires a 16-byte aligned buffer in %rdx */
     986             :   "\n\t" "       xor     $48,    %r8"
     987             : 
     988             :   "\n\t" ".Laes_schedule_go:"
     989             :   "\n\t" "       cmp     $192,   %rsi"
     990             :   "\n\t" "       je      .Laes_schedule_192"
     991             :   "\n\t" "       cmp     $256,   %rsi"
     992             :   "\n\t" "       je      .Laes_schedule_256"
     993             :   "\n\t" "       # 128: fall though" /* any size other than 192 or 256 takes the 128-bit path */
     994             : 
     995             :   "\n\t" "##"
     996             :   "\n\t" "##  .Laes_schedule_128"
     997             :   "\n\t" "##"
     998             :   "\n\t" "##  128-bit specific part of key schedule."
     999             :   "\n\t" "##"
    1000             :   "\n\t" "##  This schedule is really simple, because all its parts"
    1001             :   "\n\t" "##  are accomplished by the subroutines."
    1002             :   "\n\t" "##"
    1003             :   "\n\t" ".Laes_schedule_128:"
    1004             :   "\n\t" "       mov     $10, %rsi" /* %rsi is reused as the loop counter; the key size is no longer needed */
    1005             : 
    1006             :   "\n\t" ".Laes_schedule_128_L:"
    1007             :   "\n\t" "       call    .Laes_schedule_round"
    1008             :   "\n\t" "       dec     %rsi"
    1009             :   "\n\t" "       jz      .Laes_schedule_mangle_last" /* after the 10th round, leave via the final-key path instead of the regular mangle */
    1010             :   "\n\t" "       call    .Laes_schedule_mangle   # write output"
    1011             :   "\n\t" "       jmp     .Laes_schedule_128_L"
    1012             : 
    1013             :   "\n\t" "##"
    1014             :   "\n\t" "##  .Laes_schedule_192"
    1015             :   "\n\t" "##"
    1016             :   "\n\t" "##  192-bit specific part of key schedule."
    1017             :   "\n\t" "##"
    1018             :   "\n\t" "##  The main body of this schedule is the same as the 128-bit"
    1019             :   "\n\t" "##  schedule, but with more smearing.  The long, high side is"
    1020             :   "\n\t" "##  stored in %xmm7 as before, and the short, low side is in"
    1021             :   "\n\t" "##  the high bits of %xmm6."
    1022             :   "\n\t" "##"
    1023             :   "\n\t" "##  This schedule is somewhat nastier, however, because each"
    1024             :   "\n\t" "##  round produces 192 bits of key material, or 1.5 round keys."
    1025             :   "\n\t" "##  Therefore, on each cycle we do 2 rounds and produce 3 round"
    1026             :   "\n\t" "##  keys."
    1027             :   "\n\t" "##"
    1028             :   "\n\t" ".Laes_schedule_192:"
    1029             :   "\n\t" "       movdqu  8(%rdi),%xmm0           # load key part 2 (very unaligned)" /* reads key bytes 8..23 */
    1030             :   "\n\t" "       call    .Laes_schedule_transform        # input transform"
    1031             :   "\n\t" "       pshufd  $0x0E,  %xmm0,  %xmm6"
    1032             :   "\n\t" "       pslldq  $8,     %xmm6           # clobber low side with zeros"
    1033             :   "\n\t" "       mov     $4,     %rsi" /* %rsi is reused as the loop counter: four passes, two rounds each */
    1034             : 
    1035             :   "\n\t" ".Laes_schedule_192_L:"
    1036             :   "\n\t" "       call    .Laes_schedule_round"
    1037             :   "\n\t" "       palignr $8,%xmm6,%xmm0  "
    1038             :   "\n\t" "       call    .Laes_schedule_mangle   # save key n"
    1039             :   "\n\t" "       call    .Laes_schedule_192_smear"
    1040             :   "\n\t" "       call    .Laes_schedule_mangle   # save key n+1"
    1041             :   "\n\t" "       call    .Laes_schedule_round"
    1042             :   "\n\t" "       dec     %rsi"
    1043             :   "\n\t" "       jz      .Laes_schedule_mangle_last" /* on the last pass the third key goes through the final-key path */
    1044             :   "\n\t" "       call    .Laes_schedule_mangle   # save key n+2"
    1045             :   "\n\t" "       call    .Laes_schedule_192_smear"
    1046             :   "\n\t" "       jmp     .Laes_schedule_192_L"
    1047             : 
    1048             :   "\n\t" "##"
    1049             :   "\n\t" "##  .Laes_schedule_192_smear"
    1050             :   "\n\t" "##"
    1051             :   "\n\t" "##  Smear the short, low side in the 192-bit key schedule."
    1052             :   "\n\t" "##"
    1053             :   "\n\t" "##  Inputs:"
    1054             :   "\n\t" "##    %xmm7: high side, b  a  x  y"
    1055             :   "\n\t" "##    %xmm6:  low side, d  c  0  0"
    1056             :   "\n\t" "##    %xmm13: 0" /* NOTE(review): the body below never reads %xmm13, and
                          _aes_schedule_core loads .Lk_sb1 into it, so this input line
                          looks stale.  */
    1057             :   "\n\t" "##"
    1058             :   "\n\t" "##  Outputs:"
    1059             :   "\n\t" "##    %xmm6: b+c+d  b+c  0  0"
    1060             :   "\n\t" "##    %xmm0: b+c+d  b+c  b  a"
    1061             :   "\n\t" "##"
    1062             :   "\n\t" ".Laes_schedule_192_smear:"
    1063             :   "\n\t" "       pshufd  $0x80,  %xmm6,  %xmm0   # d c 0 0 -> c 0 0 0"
    1064             :   "\n\t" "       pxor    %xmm0,  %xmm6           # -> c+d c 0 0"
    1065             :   "\n\t" "       pshufd  $0xFE,  %xmm7,  %xmm0   # b a _ _ -> b b b a"
    1066             :   "\n\t" "       pxor    %xmm6,  %xmm0           # -> b+c+d b+c b a"
    1067             :   "\n\t" "       pshufd  $0x0E,  %xmm0,  %xmm6" /* move the upper two dwords of %xmm0 into the low half of %xmm6 ... */
    1068             :   "\n\t" "       pslldq  $8,     %xmm6           # clobber low side with zeros" /* ... then shift them back up, leaving zeros below */
    1069             :   "\n\t" "       ret"
    1070             : 
    1071             :   "\n\t" "##"
    1072             :   "\n\t" "##  .Laes_schedule_256"
    1073             :   "\n\t" "##"
    1074             :   "\n\t" "##  256-bit specific part of key schedule."
    1075             :   "\n\t" "##"
    1076             :   "\n\t" "##  The structure here is very similar to the 128-bit"
    1077             :   "\n\t" "##  schedule, but with an additional 'low side' in"
    1078             :   "\n\t" "##  %xmm6.  The low side's rounds are the same as the"
    1079             :   "\n\t" "##  high side's, except no rcon and no rotation."
    1080             :   "\n\t" "##"
    1081             :   "\n\t" ".Laes_schedule_256:"
    1082             :   "\n\t" "       movdqu  16(%rdi),%xmm0          # load key part 2 (unaligned)" /* reads key bytes 16..31 */
    1083             :   "\n\t" "       call    .Laes_schedule_transform        # input transform"
    1084             :   "\n\t" "       mov     $7, %rsi" /* %rsi is reused as the loop counter: seven high rounds */
    1085             : 
    1086             :   "\n\t" ".Laes_schedule_256_L:"
    1087             :   "\n\t" "       call    .Laes_schedule_mangle   # output low result"
    1088             :   "\n\t" "       movdqa  %xmm0,  %xmm6           # save cur_lo in xmm6"
    1089             : 
    1090             :   "\n\t" "       # high round"
    1091             :   "\n\t" "       call    .Laes_schedule_round"
    1092             :   "\n\t" "       dec     %rsi"
    1093             :   "\n\t" "       jz      .Laes_schedule_mangle_last" /* after the 7th high round, leave via the final-key path */
    1094             :   "\n\t" "       call    .Laes_schedule_mangle   "
    1095             : 
    1096             :   "\n\t" "       # low round. swap xmm7 and xmm6"
    1097             :   "\n\t" "       pshufd  $0xFF,  %xmm0,  %xmm0" /* broadcast the top dword of %xmm0 to all four lanes */
    1098             :   "\n\t" "       movdqa  %xmm7,  %xmm5" /* park the high side in %xmm5 during the low round */
    1099             :   "\n\t" "       movdqa  %xmm6,  %xmm7"
    1100             :   "\n\t" "       call    .Laes_schedule_low_round"
    1101             :   "\n\t" "       movdqa  %xmm5,  %xmm7" /* restore the high side */
    1102             : 
    1103             :   "\n\t" "       jmp     .Laes_schedule_256_L"
    1104             : 
    1105             :   "\n\t" "##"
    1106             :   "\n\t" "##  .Laes_schedule_round"
    1107             :   "\n\t" "##"
    1108             :   "\n\t" "##  Runs one main round of the key schedule on %xmm0, %xmm7"
    1109             :   "\n\t" "##"
    1110             :   "\n\t" "##  Specifically, runs subbytes on the high dword of %xmm0"
    1111             :   "\n\t" "##  then rotates it by one byte and xors into the low dword of"
    1112             :   "\n\t" "##  %xmm7."
    1113             :   "\n\t" "##"
    1114             :   "\n\t" "##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for"
    1115             :   "\n\t" "##  next rcon."
    1116             :   "\n\t" "##"
    1117             :   "\n\t" "##  Smears the dwords of %xmm7 by xoring the low into the"
    1118             :   "\n\t" "##  second low, result into third, result into highest."
    1119             :   "\n\t" "##"
    1120             :   "\n\t" "##  Returns results in %xmm7 = %xmm0."
    1121             :   "\n\t" "##  Clobbers %xmm1-%xmm4 (only xmm registers are written; %xmm8 is rotated as noted above)."
    1122             :   "\n\t" "##  Expects %xmm9-%xmm11 preloaded and %r10 = address of .Laes_consts (.Lk_* are offsets from it)."
    1123             :   "\n\t" ".Laes_schedule_round:"
    1124             :   "\n\t" "       # extract rcon from xmm8: xmm1 gets one byte of xmm8, xmm8 is rotated by one byte"
    1125             :   "\n\t" "       pxor    %xmm1,  %xmm1"
    1126             :   "\n\t" "       palignr $15,    %xmm8,  %xmm1"
    1127             :   "\n\t" "       palignr $15,    %xmm8,  %xmm8"
    1128             :   "\n\t" "       pxor    %xmm1,  %xmm7"
    1129             : 
    1130             :   "\n\t" "       # rotate: broadcast the high dword of xmm0, then rotate the register by one byte"
    1131             :   "\n\t" "       pshufd  $0xFF,  %xmm0,  %xmm0"
    1132             :   "\n\t" "       palignr $1,     %xmm0,  %xmm0"
    1133             : 
    1134             :   "\n\t" "       # fall through into .Laes_schedule_low_round..."
    1135             : 
    1136             :   "\n\t" "       # low round: same as high round, but no rotation and no rcon (also entered directly by call)."
    1137             :   "\n\t" ".Laes_schedule_low_round:"
    1138             :   "\n\t" "       # smear xmm7 (shift by 4 and xor, shift by 8 and xor), then xor in the s63 constant"
    1139             :   "\n\t" "       movdqa  %xmm7,  %xmm1"
    1140             :   "\n\t" "       pslldq  $4,     %xmm7"
    1141             :   "\n\t" "       pxor    %xmm1,  %xmm7"
    1142             :   "\n\t" "       movdqa  %xmm7,  %xmm1"
    1143             :   "\n\t" "       pslldq  $8,     %xmm7"
    1144             :   "\n\t" "       pxor    %xmm1,  %xmm7"
    1145             :   "\n\t" "       pxor    .Lk_s63(%r10), %xmm7"
    1146             : 
    1147             :   "\n\t" "       # subbytes on xmm0, using the nibble mask in xmm9 and the lookup tables in xmm10/xmm11"
    1148             :   "\n\t" "       movdqa  %xmm9,  %xmm1"
    1149             :   "\n\t" "       pandn   %xmm0,  %xmm1"
    1150             :   "\n\t" "       psrld   $4,     %xmm1           # 1 = i (high nibbles)"
    1151             :   "\n\t" "       pand    %xmm9,  %xmm0           # 0 = k (low nibbles)"
    1152             :   "\n\t" "       movdqa  %xmm11, %xmm2           # 2 : a/k"
    1153             :   "\n\t" "       pshufb  %xmm0,  %xmm2           # 2 = a/k"
    1154             :   "\n\t" "       pxor    %xmm1,  %xmm0           # 0 = j"
    1155             :   "\n\t" "       movdqa  %xmm10, %xmm3           # 3 : 1/i"
    1156             :   "\n\t" "       pshufb  %xmm1,  %xmm3           # 3 = 1/i"
    1157             :   "\n\t" "       pxor    %xmm2,  %xmm3           # 3 = iak = 1/i + a/k"
    1158             :   "\n\t" "       movdqa  %xmm10, %xmm4           # 4 : 1/j"
    1159             :   "\n\t" "       pshufb  %xmm0,  %xmm4           # 4 = 1/j"
    1160             :   "\n\t" "       pxor    %xmm2,  %xmm4           # 4 = jak = 1/j + a/k"
    1161             :   "\n\t" "       movdqa  %xmm10, %xmm2           # 2 : 1/iak"
    1162             :   "\n\t" "       pshufb  %xmm3,  %xmm2           # 2 = 1/iak"
    1163             :   "\n\t" "       pxor    %xmm0,  %xmm2           # 2 = io"
    1164             :   "\n\t" "       movdqa  %xmm10, %xmm3           # 3 : 1/jak"
    1165             :   "\n\t" "       pshufb  %xmm4,  %xmm3           # 3 = 1/jak"
    1166             :   "\n\t" "       pxor    %xmm1,  %xmm3           # 3 = jo"
    1167             :   "\n\t" "       movdqa  .Lk_sb1(%r10), %xmm4    # 4 : sb1u"
    1168             :   "\n\t" "       pshufb  %xmm2,  %xmm4           # 4 = sb1u"
    1169             :   "\n\t" "       movdqa  .Lk_sb1+16(%r10), %xmm0 # 0 : sb1t"
    1170             :   "\n\t" "       pshufb  %xmm3,  %xmm0           # 0 = sb1t"
    1171             :   "\n\t" "       pxor    %xmm4,  %xmm0           # 0 = sbox output"
    1172             : 
    1173             :   "\n\t" "       # add in smeared stuff; the result is left in both xmm0 and xmm7"
    1174             :   "\n\t" "       pxor    %xmm7,  %xmm0   "
    1175             :   "\n\t" "       movdqa  %xmm0,  %xmm7"
    1176             :   "\n\t" "       ret"
    1177             : 
    1178             :   "\n\t" "##"
    1179             :   "\n\t" "##  .Laes_schedule_transform"
    1180             :   "\n\t" "##"
    1181             :   "\n\t" "##  Linear-transform %xmm0 according to tables at (%r11)"
    1182             :   "\n\t" "##  (two 16-byte tables: (%r11) looked up by low nibbles, 16(%r11) by high nibbles; results xored)"
    1183             :   "\n\t" "##  Requires that %xmm9 = 0x0F0F... as in preheat"
    1184             :   "\n\t" "##  Output in %xmm0"
    1185             :   "\n\t" "##  Clobbers %xmm1, %xmm2"
    1186             :   "\n\t" "##"
    1187             :   "\n\t" ".Laes_schedule_transform:"
    1188             :   "\n\t" "       movdqa  %xmm9,  %xmm1"
    1189             :   "\n\t" "       pandn   %xmm0,  %xmm1"
    1190             :   "\n\t" "       psrld   $4,     %xmm1"
    1191             :   "\n\t" "       pand    %xmm9,  %xmm0"
    1192             :   "\n\t" "       movdqa  (%r11), %xmm2   # lo: table indexed by the low nibbles (xmm0)"
    1193             :   "\n\t" "       pshufb  %xmm0,  %xmm2"
    1194             :   "\n\t" "       movdqa  16(%r11), %xmm0 # hi: table indexed by the high nibbles (xmm1)"
    1195             :   "\n\t" "       pshufb  %xmm1,  %xmm0"
    1196             :   "\n\t" "       pxor    %xmm2,  %xmm0"
    1197             :   "\n\t" "       ret"
    1198             : 
    1199             :   "\n\t" "##"
    1200             :   "\n\t" "##  .Laes_schedule_mangle"
    1201             :   "\n\t" "##"
    1202             :   "\n\t" "##  Mangle xmm0 from (basis-transformed) standard version"
    1203             :   "\n\t" "##  to our version."
    1204             :   "\n\t" "##"
    1205             :   "\n\t" "##  On encrypt (%rcx == 0),"
    1206             :   "\n\t" "##    xor with 0x63"
    1207             :   "\n\t" "##    multiply by circulant 0,1,1,1"
    1208             :   "\n\t" "##    apply shiftrows transform"
    1209             :   "\n\t" "##"
    1210             :   "\n\t" "##  On decrypt (%rcx != 0),"
    1211             :   "\n\t" "##    xor with 0x63"
    1212             :   "\n\t" "##    multiply by 'inverse mixcolumns' circulant E,B,D,9"
    1213             :   "\n\t" "##    deskew"
    1214             :   "\n\t" "##    apply shiftrows transform"
    1215             :   "\n\t" "##"
    1216             :   "\n\t" "##  NOTE(review): table order noted below (*9,*B,*D,*E) vs. at .Lk_dks_* (9,D,B,E) disagree -- confirm"
    1217             :   "\n\t" "##  Writes out to (%rdx), and increments (encrypt) or decrements (decrypt) it by 16 first"
    1218             :   "\n\t" "##  Keeps track of round number mod 4 in %r8 (stored as a multiple of 16: 0, 16, 32 or 48)"
    1219             :   "\n\t" "##  Preserves xmm0"
    1220             :   "\n\t" "##  Clobbers xmm1-xmm5 and flags; also %r11 when decrypting"
    1221             :   "\n\t" "##"
    1222             :   "\n\t" ".Laes_schedule_mangle:"
    1223             :   "\n\t" "       movdqa  %xmm0,  %xmm4   # save xmm0 for later"
    1224             :   "\n\t" "       movdqa  .Lk_mc_forward(%r10),%xmm5"
    1225             :   "\n\t" "       test    %rcx,   %rcx"
    1226             :   "\n\t" "       jnz     .Laes_schedule_mangle_dec"
    1227             : 
    1228             :   "\n\t" "       # encrypting: xmm3 = xor of the 1st, 2nd and 3rd mc_forward permutation of (xmm0 ^ s63)"
    1229             :   "\n\t" "       add     $16,    %rdx"
    1230             :   "\n\t" "       pxor    .Lk_s63(%r10),%xmm4"
    1231             :   "\n\t" "       pshufb  %xmm5,  %xmm4"
    1232             :   "\n\t" "       movdqa  %xmm4,  %xmm3"
    1233             :   "\n\t" "       pshufb  %xmm5,  %xmm4"
    1234             :   "\n\t" "       pxor    %xmm4,  %xmm3"
    1235             :   "\n\t" "       pshufb  %xmm5,  %xmm4"
    1236             :   "\n\t" "       pxor    %xmm4,  %xmm3"
    1237             : 
    1238             :   "\n\t" "       jmp     .Laes_schedule_mangle_both"
    1239             : 
    1240             :   "\n\t" ".Laes_schedule_mangle_dec:"
    1241             :   "\n\t" "       lea     .Lk_dks_1(%r10), %r11   # first table: *9"
    1242             :   "\n\t" "       call    .Laes_schedule_transform"
    1243             :   "\n\t" "       movdqa  %xmm0,  %xmm3"
    1244             :   "\n\t" "       pshufb  %xmm5,  %xmm3"
    1245             : 
    1246             :   "\n\t" "       add     $32,    %r11            # next table:  *B (i.e. .Lk_dks_2, 32 bytes further)"
    1247             :   "\n\t" "       call    .Laes_schedule_transform"
    1248             :   "\n\t" "       pxor    %xmm0,  %xmm3"
    1249             :   "\n\t" "       pshufb  %xmm5,  %xmm3"
    1250             : 
    1251             :   "\n\t" "       add     $32,    %r11            # next table:  *D (i.e. .Lk_dks_3)"
    1252             :   "\n\t" "       call    .Laes_schedule_transform"
    1253             :   "\n\t" "       pxor    %xmm0,  %xmm3"
    1254             :   "\n\t" "       pshufb  %xmm5,  %xmm3"
    1255             : 
    1256             :   "\n\t" "       add     $32,    %r11            # next table:  *E (i.e. .Lk_dks_4)"
    1257             :   "\n\t" "       call    .Laes_schedule_transform"
    1258             :   "\n\t" "       pxor    %xmm0,  %xmm3"
    1259             :   "\n\t" "       pshufb  %xmm5,  %xmm3"
    1260             : 
    1261             :   "\n\t" "       movdqa  %xmm4,  %xmm0           # restore %xmm0 (overwritten by the transforms above)"
    1262             :   "\n\t" "       add     $-16,   %rdx"
    1263             : 
    1264             :   "\n\t" ".Laes_schedule_mangle_both:"
    1265             :   "\n\t" "       pshufb  .Lk_sr(%r8,%r10),%xmm3"
    1266             :   "\n\t" "       add     $-16,   %r8"
    1267             :   "\n\t" "       and     $48,    %r8"
    1268             :   "\n\t" "       movdqa  %xmm3,  (%rdx)"
    1269             :   "\n\t" "       ret"
    1270             : 
    1271             :   "\n\t" "##"
    1272             :   "\n\t" "##  .Laes_schedule_mangle_last"
    1273             :   "\n\t" "##"
    1274             :   "\n\t" "##  Mangler for last round of key schedule"
    1275             :   "\n\t" "##  Mangles %xmm0"
    1276             :   "\n\t" "##    when encrypting (%rcx == 0), outputs out(%xmm0) ^ 63"
    1277             :   "\n\t" "##    when decrypting (%rcx != 0), outputs unskew(%xmm0)"
    1278             :   "\n\t" "##"
    1279             :   "\n\t" "##  Always run right before return... falls through to the cleanup below and exits"
    1280             :   "\n\t" "##"
    1281             :   "\n\t" ".Laes_schedule_mangle_last:"
    1282             :   "\n\t" "       # schedule last round key from xmm0; %r11 selects the table pair for the final transform"
    1283             :   "\n\t" "       lea     .Lk_deskew(%r10),%r11   # prepare to deskew (decrypt default)"
    1284             :   "\n\t" "       test    %rcx,   %rcx"
    1285             :   "\n\t" "       jnz     .Laes_schedule_mangle_last_dec"
    1286             : 
    1287             :   "\n\t" "       # encrypting: net effect on %rdx is +16 (+32 here, -16 in the shared code below)"
    1288             :   "\n\t" "       pshufb  .Lk_sr(%r8,%r10),%xmm0  # output permute"
    1289             :   "\n\t" "       lea     .Lk_opt(%r10),  %r11    # prepare to output transform"
    1290             :   "\n\t" "       add     $32,    %rdx"
    1291             : 
    1292             :   "\n\t" ".Laes_schedule_mangle_last_dec:"
    1293             :   "\n\t" "       add     $-16,   %rdx"
    1294             :   "\n\t" "       pxor    .Lk_s63(%r10),  %xmm0"
    1295             :   "\n\t" "       call    .Laes_schedule_transform # output transform with the tables chosen in %r11"
    1296             :   "\n\t" "       movdqa  %xmm0,  (%rdx)          # save last key"
    1297             : 
    1298             :   "\n\t" "       #_aes_cleanup: zero %xmm0-%xmm8 before returning"
    1299             :   "\n\t" "       pxor    %xmm0,  %xmm0"
    1300             :   "\n\t" "       pxor    %xmm1,  %xmm1"
    1301             :   "\n\t" "       pxor    %xmm2,  %xmm2"
    1302             :   "\n\t" "       pxor    %xmm3,  %xmm3"
    1303             :   "\n\t" "       pxor    %xmm4,  %xmm4"
    1304             :   "\n\t" "       pxor    %xmm5,  %xmm5"
    1305             :   "\n\t" "       pxor    %xmm6,  %xmm6"
    1306             :   "\n\t" "       pxor    %xmm7,  %xmm7"
    1307             :   "\n\t" "       pxor    %xmm8,  %xmm8"
    1308             :   "\n\t" "       ret"
    1309             : X("\n\t" ".size _aes_schedule_core,.-_aes_schedule_core")
    1310             : 
    1311             :   "\n\t" "########################################################"
    1312             :   "\n\t" "##                                                    ##"
    1313             :   "\n\t" "##                     Constants                      ##"
    1314             :   "\n\t" "##                                                    ##"
    1315             :   "\n\t" "########################################################"
    1316             : 
    1317             :   "\n\t" ".align 16"
    1318             : X("\n\t" ".type _aes_consts,@object")
    1319             :   "\n\t" ".Laes_consts:"
    1320             :   "\n\t" "_aes_consts:"
    1321             :   "\n\t" "       # s0F: 0x0F in every byte (nibble mask)"
    1322             :   "\n\t" "       .Lk_s0F = .-.Laes_consts"
    1323             :   "\n\t" "       .quad   0x0F0F0F0F0F0F0F0F"
    1324             :   "\n\t" "       .quad   0x0F0F0F0F0F0F0F0F"
    1325             : 
    1326             :   "\n\t" "       # ipt: input transform (lo, hi)"
    1327             :   "\n\t" "       .Lk_ipt = .-.Laes_consts"
    1328             :   "\n\t" "       .quad   0xC2B2E8985A2A7000"
    1329             :   "\n\t" "       .quad   0xCABAE09052227808"
    1330             :   "\n\t" "       .quad   0x4C01307D317C4D00"
    1331             :   "\n\t" "       .quad   0xCD80B1FCB0FDCC81"
    1332             : 
    1333             :   "\n\t" "       # inv, inva"
    1334             :   "\n\t" "       .Lk_inv = .-.Laes_consts"
    1335             :   "\n\t" "       .quad   0x0E05060F0D080180"
    1336             :   "\n\t" "       .quad   0x040703090A0B0C02"
    1337             :   "\n\t" "       .quad   0x01040A060F0B0780"
    1338             :   "\n\t" "       .quad   0x030D0E0C02050809"
    1339             : 
    1340             :   "\n\t" "       # sb1u, sb1t (also used by the key schedule's subbytes step)"
    1341             :   "\n\t" "       .Lk_sb1 = .-.Laes_consts"
    1342             :   "\n\t" "       .quad   0xB19BE18FCB503E00"
    1343             :   "\n\t" "       .quad   0xA5DF7A6E142AF544"
    1344             :   "\n\t" "       .quad   0x3618D415FAE22300"
    1345             :   "\n\t" "       .quad   0x3BF7CCC10D2ED9EF"
    1346             : 
    1347             : 
    1348             :   "\n\t" "       # sb2u, sb2t"
    1349             :   "\n\t" "       .Lk_sb2 = .-.Laes_consts"
    1350             :   "\n\t" "       .quad   0xE27A93C60B712400"
    1351             :   "\n\t" "       .quad   0x5EB7E955BC982FCD"
    1352             :   "\n\t" "       .quad   0x69EB88400AE12900"
    1353             :   "\n\t" "       .quad   0xC2A163C8AB82234A"
    1354             : 
    1355             :   "\n\t" "       # sbou, sbot"
    1356             :   "\n\t" "       .Lk_sbo = .-.Laes_consts"
    1357             :   "\n\t" "       .quad   0xD0D26D176FBDC700"
    1358             :   "\n\t" "       .quad   0x15AABF7AC502A878"
    1359             :   "\n\t" "       .quad   0xCFE474A55FBB6A00"
    1360             :   "\n\t" "       .quad   0x8E1E90D1412B35FA"
    1361             : 
    1362             :   "\n\t" "       # mc_forward: four 16-byte pshufb masks (the key schedule mangler uses the first)"
    1363             :   "\n\t" "       .Lk_mc_forward = .-.Laes_consts"
    1364             :   "\n\t" "       .quad   0x0407060500030201"
    1365             :   "\n\t" "       .quad   0x0C0F0E0D080B0A09"
    1366             :   "\n\t" "       .quad   0x080B0A0904070605"
    1367             :   "\n\t" "       .quad   0x000302010C0F0E0D"
    1368             :   "\n\t" "       .quad   0x0C0F0E0D080B0A09"
    1369             :   "\n\t" "       .quad   0x0407060500030201"
    1370             :   "\n\t" "       .quad   0x000302010C0F0E0D"
    1371             :   "\n\t" "       .quad   0x080B0A0904070605"
    1372             : 
    1373             :   "\n\t" "       # mc_backward"
    1374             :   "\n\t" "       .Lk_mc_backward = .-.Laes_consts"
    1375             :   "\n\t" "       .quad   0x0605040702010003"
    1376             :   "\n\t" "       .quad   0x0E0D0C0F0A09080B"
    1377             :   "\n\t" "       .quad   0x020100030E0D0C0F"
    1378             :   "\n\t" "       .quad   0x0A09080B06050407"
    1379             :   "\n\t" "       .quad   0x0E0D0C0F0A09080B"
    1380             :   "\n\t" "       .quad   0x0605040702010003"
    1381             :   "\n\t" "       .quad   0x0A09080B06050407"
    1382             :   "\n\t" "       .quad   0x020100030E0D0C0F"
    1383             : 
    1384             :   "\n\t" "       # sr: four 16-byte pshufb masks, selected in the key schedule by %r8 = 0, 16, 32 or 48"
    1385             :   "\n\t" "       .Lk_sr = .-.Laes_consts"
    1386             :   "\n\t" "       .quad   0x0706050403020100"
    1387             :   "\n\t" "       .quad   0x0F0E0D0C0B0A0908"
    1388             :   "\n\t" "       .quad   0x030E09040F0A0500"
    1389             :   "\n\t" "       .quad   0x0B06010C07020D08"
    1390             :   "\n\t" "       .quad   0x0F060D040B020900"
    1391             :   "\n\t" "       .quad   0x070E050C030A0108"
    1392             :   "\n\t" "       .quad   0x0B0E0104070A0D00"
    1393             :   "\n\t" "       .quad   0x0306090C0F020508"
    1394             : 
    1395             :   "\n\t" "       # rcon"
    1396             :   "\n\t" "       .Lk_rcon = .-.Laes_consts"
    1397             :   "\n\t" "       .quad   0x1F8391B9AF9DEEB6"
    1398             :   "\n\t" "       .quad   0x702A98084D7C7D81"
    1399             : 
    1400             :   "\n\t" "       # s63: all equal to 0x63 transformed (0x5B in every byte)"
    1401             :   "\n\t" "       .Lk_s63 = .-.Laes_consts"
    1402             :   "\n\t" "       .quad   0x5B5B5B5B5B5B5B5B"
    1403             :   "\n\t" "       .quad   0x5B5B5B5B5B5B5B5B"
    1404             : 
    1405             :   "\n\t" "       # opt: output transform (lo, hi)"
    1406             :   "\n\t" "       .Lk_opt = .-.Laes_consts"
    1407             :   "\n\t" "       .quad   0xFF9F4929D6B66000"
    1408             :   "\n\t" "       .quad   0xF7974121DEBE6808"
    1409             :   "\n\t" "       .quad   0x01EDBD5150BCEC00"
    1410             :   "\n\t" "       .quad   0xE10D5DB1B05C0CE0"
    1411             : 
    1412             :   "\n\t" "       # deskew tables: inverts the sbox's 'skew'"
    1413             :   "\n\t" "       .Lk_deskew = .-.Laes_consts"
    1414             :   "\n\t" "       .quad   0x07E4A34047A4E300"
    1415             :   "\n\t" "       .quad   0x1DFEB95A5DBEF91A"
    1416             :   "\n\t" "       .quad   0x5F36B5DC83EA6900"
    1417             :   "\n\t" "       .quad   0x2841C2ABF49D1E77"
    1418             : 
    1419             :   "\n\t" "##"
    1420             :   "\n\t" "##  Decryption stuff"
    1421             :   "\n\t" "##  Key schedule constants"
    1422             :   "\n\t" "##"
    1423             :   "\n\t" "       # decryption key schedule: x -> invskew x*9"
    1424             :   "\n\t" "       .Lk_dks_1 = .-.Laes_consts"
    1425             :   "\n\t" "       .quad   0xB6116FC87ED9A700"
    1426             :   "\n\t" "       .quad   0x4AED933482255BFC"
    1427             :   "\n\t" "       .quad   0x4576516227143300"
    1428             :   "\n\t" "       .quad   0x8BB89FACE9DAFDCE"
    1429             : 
    1430             :   "\n\t" "       # decryption key schedule: invskew x*9 -> invskew x*D"
    1431             :   "\n\t" "       .Lk_dks_2 = .-.Laes_consts"
    1432             :   "\n\t" "       .quad   0x27438FEBCCA86400"
    1433             :   "\n\t" "       .quad   0x4622EE8AADC90561"
    1434             :   "\n\t" "       .quad   0x815C13CE4F92DD00"
    1435             :   "\n\t" "       .quad   0x73AEE13CBD602FF2"
    1436             : 
    1437             :   "\n\t" "       # decryption key schedule: invskew x*D -> invskew x*B"
    1438             :   "\n\t" "       .Lk_dks_3 = .-.Laes_consts"
    1439             :   "\n\t" "       .quad   0x03C4C50201C6C700"
    1440             :   "\n\t" "       .quad   0xF83F3EF9FA3D3CFB"
    1441             :   "\n\t" "       .quad   0xEE1921D638CFF700"
    1442             :   "\n\t" "       .quad   0xA5526A9D7384BC4B"
    1443             : 
    1444             :   "\n\t" "       # decryption key schedule: invskew x*B -> invskew x*E + 0x63"
    1445             :   "\n\t" "       .Lk_dks_4 = .-.Laes_consts"
    1446             :   "\n\t" "       .quad   0xE3C390B053732000"
    1447             :   "\n\t" "       .quad   0xA080D3F310306343"
    1448             :   "\n\t" "       .quad   0xA0CA214B036982E8"
    1449             :   "\n\t" "       .quad   0x2F45AEC48CE60D67"
    1450             : 
    1451             :   "\n\t" "##"
    1452             :   "\n\t" "##  Decryption stuff"
    1453             :   "\n\t" "##  Round function constants"
    1454             :   "\n\t" "##"
    1455             :   "\n\t" "       # decryption input transform"
    1456             :   "\n\t" "       .Lk_dipt = .-.Laes_consts"
    1457             :   "\n\t" "       .quad   0x0F505B040B545F00"
    1458             :   "\n\t" "       .quad   0x154A411E114E451A"
    1459             :   "\n\t" "       .quad   0x86E383E660056500"
    1460             :   "\n\t" "       .quad   0x12771772F491F194"
    1461             : 
    1462             :   "\n\t" "       # decryption sbox output *9*u, *9*t"
    1463             :   "\n\t" "       .Lk_dsb9 = .-.Laes_consts"
    1464             :   "\n\t" "       .quad   0x851C03539A86D600"
    1465             :   "\n\t" "       .quad   0xCAD51F504F994CC9"
    1466             :   "\n\t" "       .quad   0xC03B1789ECD74900"
    1467             :   "\n\t" "       .quad   0x725E2C9EB2FBA565"
    1468             : 
    1469             :   "\n\t" "       # decryption sbox output *D*u, *D*t"
    1470             :   "\n\t" "       .Lk_dsbd = .-.Laes_consts"
    1471             :   "\n\t" "       .quad   0x7D57CCDFE6B1A200"
    1472             :   "\n\t" "       .quad   0xF56E9B13882A4439"
    1473             :   "\n\t" "       .quad   0x3CE2FAF724C6CB00"
    1474             :   "\n\t" "       .quad   0x2931180D15DEEFD3"
    1475             : 
    1476             :   "\n\t" "       # decryption sbox output *B*u, *B*t"
    1477             :   "\n\t" "       .Lk_dsbb = .-.Laes_consts"
    1478             :   "\n\t" "       .quad   0xD022649296B44200"
    1479             :   "\n\t" "       .quad   0x602646F6B0F2D404"
    1480             :   "\n\t" "       .quad   0xC19498A6CD596700"
    1481             :   "\n\t" "       .quad   0xF3FF0C3E3255AA6B"
    1482             : 
    1483             :   "\n\t" "       # decryption sbox output *E*u, *E*t"
    1484             :   "\n\t" "       .Lk_dsbe = .-.Laes_consts"
    1485             :   "\n\t" "       .quad   0x46F2929626D4D000"
    1486             :   "\n\t" "       .quad   0x2242600464B4F6B0"
    1487             :   "\n\t" "       .quad   0x0C55A6CDFFAAC100"
    1488             :   "\n\t" "       .quad   0x9467F36B98593E32"
    1489             : 
    1490             :   "\n\t" "       # decryption sbox final output"
    1491             :   "\n\t" "       .Lk_dsbo = .-.Laes_consts"
    1492             :   "\n\t" "       .quad   0x1387EA537EF94000"
    1493             :   "\n\t" "       .quad   0xC7AA6DB9D4943E2D"
    1494             :   "\n\t" "       .quad   0x12D7560F93441D00"
    1495             :   "\n\t" "       .quad   0xCA4B8159D8C58E9C"
    1496             : X("\n\t" ".size _aes_consts,.-_aes_consts")
    1497             : );
    1498             : 
    1499             : #endif /* USE_SSSE3 */

Generated by: LCOV version 1.12