LCOV - code coverage report
Current view: top level - cipher - cipher-gcm-intel-pclmul.c (source / functions)
Test:         coverage.info
Date:         2016-12-15 12:59:22

                 Hit    Total    Coverage
Lines:             0       39       0.0 %
Functions:         0        4       0.0 %

          Line data    Source code
       1             : /* cipher-gcm-intel-pclmul.c  -  Intel PCLMUL accelerated Galois Counter Mode
       2             :  *                               implementation
       3             :  * Copyright (C) 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
       4             :  *
       5             :  * This file is part of Libgcrypt.
       6             :  *
       7             :  * Libgcrypt is free software; you can redistribute it and/or modify
       8             :  * it under the terms of the GNU Lesser General Public License as
       9             :  * published by the Free Software Foundation; either version 2.1 of
      10             :  * the License, or (at your option) any later version.
      11             :  *
      12             :  * Libgcrypt is distributed in the hope that it will be useful,
      13             :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      14             :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      15             :  * GNU Lesser General Public License for more details.
      16             :  *
      17             :  * You should have received a copy of the GNU Lesser General Public
      18             :  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
      19             :  */
      20             : 
      21             : #include <config.h>
      22             : #include <stdio.h>
      23             : #include <stdlib.h>
      24             : #include <string.h>
      25             : #include <errno.h>
      26             : 
      27             : #include "g10lib.h"
      28             : #include "cipher.h"
      29             : #include "bufhelp.h"
      30             : #include "./cipher-internal.h"
      31             : 
      32             : 
      33             : #ifdef GCM_USE_INTEL_PCLMUL
      34             : 
      35             : 
      36             : #if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
      37             : /* Prevent compiler from issuing SSE instructions between asm blocks. */
      38             : #  pragma GCC target("no-sse")
      39             : #endif
      40             : 
      41             : 
      42             : /*
      43             :  Intel PCLMUL ghash based on white paper:
      44             :   "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the
      45             :    GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis.
      46             :  */
      47           0 : static inline void gfmul_pclmul(void)
      48             : {
      49             :   /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified.
      50             :      Input must be converted to little-endian.
      51             :    */
      52           0 :   asm volatile (/* gfmul, xmm0 has operator a and xmm1 has operator b. */
      53             :                 "pshufd $78, %%xmm0, %%xmm2\n\t"
      54             :                 "pshufd $78, %%xmm1, %%xmm4\n\t"
      55             :                 "pxor %%xmm0, %%xmm2\n\t" /* xmm2 holds a0+a1 */
      56             :                 "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds b0+b1 */
      57             : 
      58             :                 "movdqa %%xmm0, %%xmm3\n\t"
      59             :                 "pclmulqdq $0, %%xmm1, %%xmm3\n\t"  /* xmm3 holds a0*b0 */
      60             :                 "movdqa %%xmm0, %%xmm6\n\t"
      61             :                 "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */
      62             :                 "movdqa %%xmm3, %%xmm5\n\t"
      63             :                 "pclmulqdq $0, %%xmm2, %%xmm4\n\t"  /* xmm4 holds (a0+a1)*(b0+b1) */
      64             : 
      65             :                 "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
      66             :                 "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
      67             :                 "movdqa %%xmm4, %%xmm5\n\t"
      68             :                 "psrldq $8, %%xmm4\n\t"
      69             :                 "pslldq $8, %%xmm5\n\t"
      70             :                 "pxor %%xmm5, %%xmm3\n\t"
      71             :                 "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the
      72             :                                              carry-less multiplication of xmm0
      73             :                                              by xmm1 */
      74             : 
      75             :                 /* shift the result one bit position to the left to cope
      76             :                    with the fact that the bits are reversed */
      77             :                 "movdqa %%xmm3, %%xmm4\n\t"
      78             :                 "movdqa %%xmm6, %%xmm5\n\t"
      79             :                 "pslld $1, %%xmm3\n\t"
      80             :                 "pslld $1, %%xmm6\n\t"
      81             :                 "psrld $31, %%xmm4\n\t"
      82             :                 "psrld $31, %%xmm5\n\t"
      83             :                 "movdqa %%xmm4, %%xmm1\n\t"
      84             :                 "pslldq $4, %%xmm5\n\t"
      85             :                 "pslldq $4, %%xmm4\n\t"
      86             :                 "psrldq $12, %%xmm1\n\t"
      87             :                 "por %%xmm4, %%xmm3\n\t"
      88             :                 "por %%xmm5, %%xmm6\n\t"
      89             :                 "por %%xmm6, %%xmm1\n\t"
      90             : 
      91             :                 /* first phase of the reduction */
      92             :                 "movdqa %%xmm3, %%xmm6\n\t"
      93             :                 "movdqa %%xmm3, %%xmm7\n\t"
      94             :                 "pslld $31, %%xmm6\n\t"  /* packed left shift << 31 */
      95             :                 "movdqa %%xmm3, %%xmm5\n\t"
      96             :                 "pslld $30, %%xmm7\n\t"  /* packed left shift << 30 */
      97             :                 "pslld $25, %%xmm5\n\t"  /* packed left shift << 25 */
      98             :                 "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */
      99             :                 "pxor %%xmm5, %%xmm6\n\t"
     100             :                 "movdqa %%xmm6, %%xmm7\n\t"
     101             :                 "pslldq $12, %%xmm6\n\t"
     102             :                 "psrldq $4, %%xmm7\n\t"
     103             :                 "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction
     104             :                                              complete */
     105             : 
     106             :                 /* second phase of the reduction */
     107             :                 "movdqa %%xmm3, %%xmm2\n\t"
     108             :                 "movdqa %%xmm3, %%xmm4\n\t"
     109             :                 "psrld $1, %%xmm2\n\t"    /* packed right shift >> 1 */
     110             :                 "movdqa %%xmm3, %%xmm5\n\t"
     111             :                 "psrld $2, %%xmm4\n\t"    /* packed right shift >> 2 */
     112             :                 "psrld $7, %%xmm5\n\t"    /* packed right shift >> 7 */
     113             :                 "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
     114             :                 "pxor %%xmm5, %%xmm2\n\t"
     115             :                 "pxor %%xmm7, %%xmm2\n\t"
     116             :                 "pxor %%xmm2, %%xmm3\n\t"
     117             :                 "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */
     118             :                 ::: "cc" );
     119           0 : }
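
For reference, the operation the PCLMUL sequence above implements is multiplication in GF(2^128) with the GHASH reduction polynomial x^128 + x^7 + x^2 + x + 1, on the bit-reflected representation GCM uses. The following bit-serial sketch is a hypothetical helper (not part of this file or of libgcrypt's build) that computes the same product directly on the 16-byte blocks as stored in memory, so it needs no byte swap and no bit-reflection fix-up; it can serve as a test oracle for the assembly path. It relies only on <string.h>, which this file already includes.

/* Reference GHASH multiply: r = x * y in GF(2^128), bit-serial algorithm
 * as described in NIST SP 800-38D.  All blocks use GCM's on-the-wire
 * byte order. */
static void
ghash_gfmul_ref (unsigned char r[16],
                 const unsigned char x[16], const unsigned char y[16])
{
  unsigned char z[16] = { 0 };  /* accumulator Z */
  unsigned char v[16];          /* running multiple V */
  int i, j, k;

  memcpy (v, y, 16);

  for (i = 0; i < 16; i++)
    for (j = 7; j >= 0; j--)
      {
        int carry = v[15] & 1;  /* coefficient shifted out of V below */

        if ((x[i] >> j) & 1)    /* bits of x, MSB of byte 0 first */
          for (k = 0; k < 16; k++)
            z[k] ^= v[k];

        /* V := V * x: shift the 128-bit string right by one bit... */
        for (k = 15; k > 0; k--)
          v[k] = (unsigned char)((v[k] >> 1) | (v[k - 1] << 7));
        v[0] >>= 1;
        /* ...and reduce by x^128 + x^7 + x^2 + x + 1 (R = 0xe1 || 0^120). */
        if (carry)
          v[0] ^= 0xe1;
      }

  memcpy (r, z, 16);
}
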
     120             : 
     121             : 
     122             : #ifdef __x86_64__
     123           0 : static inline void gfmul_pclmul_aggr4(void)
     124             : {
     125             :   /* Input:
     126             :       H¹: XMM0          X_i            : XMM6
     127             :       H²: XMM8          X_(i-1)        : XMM3
     128             :       H³: XMM9          X_(i-2)        : XMM2
     129             :       H⁴: XMM10         X_(i-3)⊕Y_(i-4): XMM1
     130             :      Output:
     131             :       Y_i: XMM1
     132             :      Input XMM0 stays unmodified.
     133             :      Input must be converted to little-endian.
     134             :    */
     135           0 :   asm volatile (/* perform clmul and merge results... */
     136             :                 "pshufd $78, %%xmm10, %%xmm11\n\t"
     137             :                 "pshufd $78, %%xmm1, %%xmm12\n\t"
     138             :                 "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */
     139             :                 "pxor %%xmm1, %%xmm12\n\t" /* xmm12 holds 4:b0+b1 */
     140             : 
     141             :                 "pshufd $78, %%xmm9, %%xmm13\n\t"
     142             :                 "pshufd $78, %%xmm2, %%xmm14\n\t"
     143             :                 "pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */
     144             :                 "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */
     145             : 
     146             :                 "pshufd $78, %%xmm8, %%xmm5\n\t"
     147             :                 "pshufd $78, %%xmm3, %%xmm15\n\t"
     148             :                 "pxor %%xmm8, %%xmm5\n\t" /* xmm5 holds 2:a0+a1 */
     149             :                 "pxor %%xmm3, %%xmm15\n\t" /* xmm15 holds 2:b0+b1 */
     150             : 
     151             :                 "movdqa %%xmm10, %%xmm4\n\t"
     152             :                 "movdqa %%xmm9, %%xmm7\n\t"
     153             :                 "pclmulqdq $0, %%xmm1, %%xmm4\n\t"   /* xmm4 holds 4:a0*b0 */
     154             :                 "pclmulqdq $0, %%xmm2, %%xmm7\n\t"   /* xmm7 holds 3:a0*b0 */
     155             :                 "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */
     156             :                 "pclmulqdq $17, %%xmm9, %%xmm2\n\t"  /* xmm2 holds 3:a1*b1 */
     157             :                 "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */
     158             :                 "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */
     159             : 
     160             :                 "pshufd $78, %%xmm0, %%xmm10\n\t"
     161             :                 "pshufd $78, %%xmm6, %%xmm11\n\t"
     162             :                 "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */
     163             :                 "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */
     164             : 
     165             :                 "pxor %%xmm4, %%xmm7\n\t"   /* xmm7 holds 3+4:a0*b0 */
     166             :                 "pxor %%xmm2, %%xmm1\n\t"   /* xmm1 holds 3+4:a1*b1 */
     167             :                 "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */
     168             : 
     169             :                 "movdqa %%xmm8, %%xmm13\n\t"
     170             :                 "pclmulqdq $0, %%xmm3, %%xmm13\n\t"  /* xmm13 holds 2:a0*b0 */
     171             :                 "pclmulqdq $17, %%xmm8, %%xmm3\n\t"  /* xmm3 holds 2:a1*b1 */
     172             :                 "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */
     173             : 
     174             :                 "pxor %%xmm13, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */
     175             :                 "pxor %%xmm3, %%xmm1\n\t"  /* xmm1 holds 2+3+4:a1*b1 */
     176             :                 "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */
     177             : 
     178             :                 "movdqa %%xmm0, %%xmm3\n\t"
     179             :                 "pclmulqdq $0, %%xmm6, %%xmm3\n\t"  /* xmm3 holds 1:a0*b0 */
     180             :                 "pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */
     181             :                 "movdqa %%xmm11, %%xmm4\n\t"
     182             :                 "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */
     183             : 
     184             :                 "pxor %%xmm7, %%xmm3\n\t"  /* xmm3 holds 1+2+3+4:a0*b0 */
     185             :                 "pxor %%xmm1, %%xmm6\n\t"  /* xmm6 holds 1+2+3+4:a1*b1 */
     186             :                 "pxor %%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */
     187             : 
     188             :                 /* aggregated reduction... */
     189             :                 "movdqa %%xmm3, %%xmm5\n\t"
     190             :                 "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
     191             :                 "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
     192             :                 "movdqa %%xmm4, %%xmm5\n\t"
     193             :                 "psrldq $8, %%xmm4\n\t"
     194             :                 "pslldq $8, %%xmm5\n\t"
     195             :                 "pxor %%xmm5, %%xmm3\n\t"
     196             :                 "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of
     197             :                                              the four aggregated carry-less
     198             :                                              multiplications */
     199             : 
     200             :                 /* shift the result one bit position to the left to cope
     201             :                    with the fact that the bits are reversed */
     202             :                 "movdqa %%xmm3, %%xmm4\n\t"
     203             :                 "movdqa %%xmm6, %%xmm5\n\t"
     204             :                 "pslld $1, %%xmm3\n\t"
     205             :                 "pslld $1, %%xmm6\n\t"
     206             :                 "psrld $31, %%xmm4\n\t"
     207             :                 "psrld $31, %%xmm5\n\t"
     208             :                 "movdqa %%xmm4, %%xmm1\n\t"
     209             :                 "pslldq $4, %%xmm5\n\t"
     210             :                 "pslldq $4, %%xmm4\n\t"
     211             :                 "psrldq $12, %%xmm1\n\t"
     212             :                 "por %%xmm4, %%xmm3\n\t"
     213             :                 "por %%xmm5, %%xmm6\n\t"
     214             :                 "por %%xmm6, %%xmm1\n\t"
     215             : 
     216             :                 /* first phase of the reduction */
     217             :                 "movdqa %%xmm3, %%xmm6\n\t"
     218             :                 "movdqa %%xmm3, %%xmm7\n\t"
     219             :                 "pslld $31, %%xmm6\n\t"  /* packed left shift << 31 */
     220             :                 "movdqa %%xmm3, %%xmm5\n\t"
     221             :                 "pslld $30, %%xmm7\n\t"  /* packed left shift << 30 */
     222             :                 "pslld $25, %%xmm5\n\t"  /* packed left shift << 25 */
     223             :                 "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */
     224             :                 "pxor %%xmm5, %%xmm6\n\t"
     225             :                 "movdqa %%xmm6, %%xmm7\n\t"
     226             :                 "pslldq $12, %%xmm6\n\t"
     227             :                 "psrldq $4, %%xmm7\n\t"
     228             :                 "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction
     229             :                                              complete */
     230             : 
     231             :                 /* second phase of the reduction */
     232             :                 "movdqa %%xmm3, %%xmm2\n\t"
     233             :                 "movdqa %%xmm3, %%xmm4\n\t"
     234             :                 "psrld $1, %%xmm2\n\t"    /* packed right shift >> 1 */
     235             :                 "movdqa %%xmm3, %%xmm5\n\t"
     236             :                 "psrld $2, %%xmm4\n\t"    /* packed right shift >> 2 */
     237             :                 "psrld $7, %%xmm5\n\t"    /* packed right shift >> 7 */
     238             :                 "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
     239             :                 "pxor %%xmm5, %%xmm2\n\t"
     240             :                 "pxor %%xmm7, %%xmm2\n\t"
     241             :                 "pxor %%xmm2, %%xmm3\n\t"
     242             :                 "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */
     243             :                 :::"cc");
     244           0 : }
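
The four-way aggregation above follows the Gueron/Kounavis paper: instead of applying the GHASH recurrence Y_i = (Y_(i-1) ⊕ X_i)•H once per block, four blocks are multiplied by H⁴, H³, H² and H¹ and the products are summed, so only one reduction is needed per four blocks. A hedged restatement in terms of the reference multiply sketched earlier (hypothetical helper names; h_powers[0..3] hold H¹..H⁴ in GCM byte order):

/* y (in/out) is the running hash Y_(i-4); x[0..3] are the input blocks
 * X_(i-3)..X_i, oldest first.  Algebraically identical to applying the
 * one-block recurrence four times, since
 *   Y_i = (X_(i-3) ⊕ Y_(i-4))•H⁴ ⊕ X_(i-2)•H³ ⊕ X_(i-1)•H² ⊕ X_i•H¹.  */
static void
ghash_aggr4_ref (unsigned char y[16], const unsigned char x[4][16],
                 const unsigned char h_powers[4][16])
{
  unsigned char acc[16] = { 0 };
  int i, k;

  for (i = 0; i < 4; i++)
    {
      unsigned char in[16], t[16];

      /* Only the oldest block absorbs the previous hash value. */
      for (k = 0; k < 16; k++)
        in[k] = (unsigned char)(x[i][k] ^ (i == 0 ? y[k] : 0));

      /* The oldest block pairs with the highest power of H. */
      ghash_gfmul_ref (t, in, h_powers[3 - i]);

      for (k = 0; k < 16; k++)
        acc[k] ^= t[k];
    }

  memcpy (y, acc, 16);
}
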
     245             : #endif
     246             : 
     247             : 
     248             : void
     249           0 : _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
     250             : {
     251             :   u64 tmp[2];
     252             : #if defined(__x86_64__) && defined(__WIN64__)
     253             :   char win64tmp[3 * 16];
     254             : 
     255             :   /* XMM6-XMM8 need to be restored after use. */
     256             :   asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
     257             :                 "movdqu %%xmm7, 1*16(%0)\n\t"
     258             :                 "movdqu %%xmm8, 2*16(%0)\n\t"
     259             :                 :
     260             :                 : "r" (win64tmp)
     261             :                 : "memory");
     262             : #endif
     263             : 
     264             :   /* Swap endianness of hsub. */
     265           0 :   tmp[0] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 8);
     266           0 :   tmp[1] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 0);
     267           0 :   buf_cpy (c->u_mode.gcm.u_ghash_key.key, tmp, GCRY_GCM_BLOCK_LEN);
     268             : 
     269             : #ifdef __x86_64__
     270           0 :   asm volatile ("movdqu %[h_1], %%xmm0\n\t"
     271             :                 "movdqa %%xmm0, %%xmm1\n\t"
     272             :                 :
     273             :                 : [h_1] "m" (*tmp));
     274             : 
     275           0 :   gfmul_pclmul (); /* H•H => H² */
     276             : 
     277           0 :   asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t"
     278             :                 "movdqa %%xmm1, %%xmm8\n\t"
     279             :                 :
     280             :                 : [h_234] "r" (c->u_mode.gcm.gcm_table)
     281             :                 : "memory");
     282             : 
     283           0 :   gfmul_pclmul (); /* H•H² => H³ */
     284             : 
     285           0 :   asm volatile ("movdqa %%xmm8, %%xmm0\n\t"
     286             :                 "movdqu %%xmm1, 1*16(%[h_234])\n\t"
     287             :                 "movdqa %%xmm8, %%xmm1\n\t"
     288             :                 :
     289             :                 : [h_234] "r" (c->u_mode.gcm.gcm_table)
     290             :                 : "memory");
     291             : 
     292           0 :   gfmul_pclmul (); /* H²•H² => H⁴ */
     293             : 
     294           0 :   asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t"
     295             :                 :
     296             :                 : [h_234] "r" (c->u_mode.gcm.gcm_table)
     297             :                 : "memory");
     298             : 
     299             : #ifdef __WIN64__
     300             :   /* Clear/restore used registers. */
     301             :   asm volatile( "pxor %%xmm0, %%xmm0\n\t"
     302             :                 "pxor %%xmm1, %%xmm1\n\t"
     303             :                 "pxor %%xmm2, %%xmm2\n\t"
     304             :                 "pxor %%xmm3, %%xmm3\n\t"
     305             :                 "pxor %%xmm4, %%xmm4\n\t"
     306             :                 "pxor %%xmm5, %%xmm5\n\t"
     307             :                 "movdqu 0*16(%0), %%xmm6\n\t"
     308             :                 "movdqu 1*16(%0), %%xmm7\n\t"
     309             :                 "movdqu 2*16(%0), %%xmm8\n\t"
     310             :                 :
     311             :                 : "r" (win64tmp)
     312             :                 : "memory");
     313             : #else
     314             :   /* Clear used registers. */
     315           0 :   asm volatile( "pxor %%xmm0, %%xmm0\n\t"
     316             :                 "pxor %%xmm1, %%xmm1\n\t"
     317             :                 "pxor %%xmm2, %%xmm2\n\t"
     318             :                 "pxor %%xmm3, %%xmm3\n\t"
     319             :                 "pxor %%xmm4, %%xmm4\n\t"
     320             :                 "pxor %%xmm5, %%xmm5\n\t"
     321             :                 "pxor %%xmm6, %%xmm6\n\t"
     322             :                 "pxor %%xmm7, %%xmm7\n\t"
     323             :                 "pxor %%xmm8, %%xmm8\n\t"
     324             :                 ::: "cc" );
     325             : #endif
     326             : #endif
     327             : 
     328           0 :   wipememory (tmp, sizeof(tmp));
     329           0 : }
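
Ignoring the byte-order conversion and the XMM register plumbing, the setup above precomputes exactly the powers of the hash subkey that the aggregated path needs: H² = H•H, H³ = H•H², H⁴ = H²•H², kept in c->u_mode.gcm.gcm_table alongside H itself. A minimal sketch with the hypothetical reference multiply from above:

/* table[0..2] receive H², H³ and H⁴; h is the hash subkey H. */
static void
ghash_precompute_powers_ref (unsigned char table[3][16],
                             const unsigned char h[16])
{
  ghash_gfmul_ref (table[0], h, h);               /* H•H   => H² */
  ghash_gfmul_ref (table[1], h, table[0]);        /* H•H²  => H³ */
  ghash_gfmul_ref (table[2], table[0], table[0]); /* H²•H² => H⁴ */
}
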
     330             : 
     331             : 
     332             : unsigned int
     333           0 : _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
     334             :                           size_t nblocks)
     335             : {
     336             :   static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
     337             :     { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
     338           0 :   const unsigned int blocksize = GCRY_GCM_BLOCK_LEN;
     339             : #ifdef __WIN64__
     340             :   char win64tmp[10 * 16];
     341             : #endif
     342             : 
     343           0 :   if (nblocks == 0)
     344           0 :     return 0;
     345             : 
     346             : #ifdef __WIN64__
     347             :   /* XMM6-XMM15 need to be restored after use. */
     348             :   asm volatile ("movdqu %%xmm6,  0*16(%0)\n\t"
     349             :                 "movdqu %%xmm7,  1*16(%0)\n\t"
     350             :                 "movdqu %%xmm8,  2*16(%0)\n\t"
     351             :                 "movdqu %%xmm9,  3*16(%0)\n\t"
     352             :                 "movdqu %%xmm10, 4*16(%0)\n\t"
     353             :                 "movdqu %%xmm11, 5*16(%0)\n\t"
     354             :                 "movdqu %%xmm12, 6*16(%0)\n\t"
     355             :                 "movdqu %%xmm13, 7*16(%0)\n\t"
     356             :                 "movdqu %%xmm14, 8*16(%0)\n\t"
     357             :                 "movdqu %%xmm15, 9*16(%0)\n\t"
     358             :                 :
     359             :                 : "r" (win64tmp)
     360             :                 : "memory" );
     361             : #endif
     362             : 
     363             :   /* Preload hash and H1. */
     364           0 :   asm volatile ("movdqu %[hash], %%xmm1\n\t"
     365             :                 "movdqa %[hsub], %%xmm0\n\t"
     366             :                 "pshufb %[be_mask], %%xmm1\n\t" /* be => le */
     367             :                 :
     368             :                 : [hash] "m" (*result), [be_mask] "m" (*be_mask),
     369             :                   [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key));
     370             : 
     371             : #ifdef __x86_64__
     372           0 :   if (nblocks >= 4)
     373             :     {
     374             :       do
     375             :         {
     376           0 :           asm volatile ("movdqa %[be_mask], %%xmm4\n\t"
     377             :                         "movdqu 0*16(%[buf]), %%xmm5\n\t"
     378             :                         "movdqu 1*16(%[buf]), %%xmm2\n\t"
     379             :                         "movdqu 2*16(%[buf]), %%xmm3\n\t"
     380             :                         "movdqu 3*16(%[buf]), %%xmm6\n\t"
     381             :                         "pshufb %%xmm4, %%xmm5\n\t" /* be => le */
     382             : 
     383             :                         /* Load H2, H3, H4. */
     384             :                         "movdqu 2*16(%[h_234]), %%xmm10\n\t"
     385             :                         "movdqu 1*16(%[h_234]), %%xmm9\n\t"
     386             :                         "movdqu 0*16(%[h_234]), %%xmm8\n\t"
     387             : 
     388             :                         "pxor %%xmm5, %%xmm1\n\t"
     389             :                         "pshufb %%xmm4, %%xmm2\n\t" /* be => le */
     390             :                         "pshufb %%xmm4, %%xmm3\n\t" /* be => le */
     391             :                         "pshufb %%xmm4, %%xmm6\n\t" /* be => le */
     392             :                         :
     393             :                         : [buf] "r" (buf), [be_mask] "m" (*be_mask),
     394             :                           [h_234] "r" (c->u_mode.gcm.gcm_table));
     395             : 
     396           0 :           gfmul_pclmul_aggr4 ();
     397             : 
     398           0 :           buf += 4 * blocksize;
     399           0 :           nblocks -= 4;
     400             :         }
     401           0 :       while (nblocks >= 4);
     402             : 
     403             : #ifndef __WIN64__
     404             :       /* Clear used x86-64/XMM registers. */
     405           0 :       asm volatile( "pxor %%xmm8, %%xmm8\n\t"
     406             :                     "pxor %%xmm9, %%xmm9\n\t"
     407             :                     "pxor %%xmm10, %%xmm10\n\t"
     408             :                     "pxor %%xmm11, %%xmm11\n\t"
     409             :                     "pxor %%xmm12, %%xmm12\n\t"
     410             :                     "pxor %%xmm13, %%xmm13\n\t"
     411             :                     "pxor %%xmm14, %%xmm14\n\t"
     412             :                     "pxor %%xmm15, %%xmm15\n\t"
     413             :                     ::: "cc" );
     414             : #endif
     415             :     }
     416             : #endif
     417             : 
     418           0 :   while (nblocks--)
     419             :     {
     420           0 :       asm volatile ("movdqu %[buf], %%xmm2\n\t"
     421             :                     "pshufb %[be_mask], %%xmm2\n\t" /* be => le */
     422             :                     "pxor %%xmm2, %%xmm1\n\t"
     423             :                     :
     424             :                     : [buf] "m" (*buf), [be_mask] "m" (*be_mask));
     425             : 
     426           0 :       gfmul_pclmul ();
     427             : 
     428           0 :       buf += blocksize;
     429             :     }
     430             : 
     431             :   /* Store hash. */
     432           0 :   asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* le => be */
     433             :                 "movdqu %%xmm1, %[hash]\n\t"
     434             :                 : [hash] "=m" (*result)
     435             :                 : [be_mask] "m" (*be_mask));
     436             : 
     437             : #ifdef __WIN64__
     438             :   /* Clear/restore used registers. */
     439             :   asm volatile( "pxor %%xmm0, %%xmm0\n\t"
     440             :                 "pxor %%xmm1, %%xmm1\n\t"
     441             :                 "pxor %%xmm2, %%xmm2\n\t"
     442             :                 "pxor %%xmm3, %%xmm3\n\t"
     443             :                 "pxor %%xmm4, %%xmm4\n\t"
     444             :                 "pxor %%xmm5, %%xmm5\n\t"
     445             :                 "movdqu 0*16(%0), %%xmm6\n\t"
     446             :                 "movdqu 1*16(%0), %%xmm7\n\t"
     447             :                 "movdqu 2*16(%0), %%xmm8\n\t"
     448             :                 "movdqu 3*16(%0), %%xmm9\n\t"
     449             :                 "movdqu 4*16(%0), %%xmm10\n\t"
     450             :                 "movdqu 5*16(%0), %%xmm11\n\t"
     451             :                 "movdqu 6*16(%0), %%xmm12\n\t"
     452             :                 "movdqu 7*16(%0), %%xmm13\n\t"
     453             :                 "movdqu 8*16(%0), %%xmm14\n\t"
     454             :                 "movdqu 9*16(%0), %%xmm15\n\t"
     455             :                 :
     456             :                 : "r" (win64tmp)
     457             :                 : "memory" );
     458             : #else
     459             :   /* Clear used registers. */
     460           0 :   asm volatile( "pxor %%xmm0, %%xmm0\n\t"
     461             :                 "pxor %%xmm1, %%xmm1\n\t"
     462             :                 "pxor %%xmm2, %%xmm2\n\t"
     463             :                 "pxor %%xmm3, %%xmm3\n\t"
     464             :                 "pxor %%xmm4, %%xmm4\n\t"
     465             :                 "pxor %%xmm5, %%xmm5\n\t"
     466             :                 "pxor %%xmm6, %%xmm6\n\t"
     467             :                 "pxor %%xmm7, %%xmm7\n\t"
     468             :                 ::: "cc" );
     469             : #endif
     470             : 
     471           0 :   return 0;
     472             : }
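
Finally, a hedged sketch of how this backend is driven (illustrative only, not libgcrypt's public API): setup runs once per key, after the generic GCM code has stored the raw hash subkey H in c->u_mode.gcm.u_ghash_key.key, and the ghash routine is then fed runs of full 16-byte blocks, updating the 16-byte hash value in place. Partial final blocks are padded and queued by the caller, not here. The function name below is hypothetical.

/* Illustrative call sequence; 'c' must already be a GCM-mode handle with
 * the hash subkey H loaded as described above. */
static void
ghash_pclmul_example (gcry_cipher_hd_t c, const byte *data, size_t datalen)
{
  byte hash[GCRY_GCM_BLOCK_LEN] = { 0 };          /* Y_0 = 0 */
  size_t nblocks = datalen / GCRY_GCM_BLOCK_LEN;  /* full blocks only */

  _gcry_ghash_setup_intel_pclmul (c);             /* once per key */
  _gcry_ghash_intel_pclmul (c, hash, data, nblocks);

  /* 'hash' now holds GHASH over the processed blocks. */
}
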
     473             : 
     474             : #endif /* GCM_USE_INTEL_PCLMUL */

Generated by: LCOV version 1.12