/* cipher-gcm-intel-pclmul.c - Intel PCLMUL accelerated Galois Counter Mode
 *                             implementation
 * Copyright (C) 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#include "g10lib.h"
#include "cipher.h"
#include "bufhelp.h"
#include "./cipher-internal.h"


#ifdef GCM_USE_INTEL_PCLMUL


#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
/* Prevent compiler from issuing SSE instructions between asm blocks. */
# pragma GCC target("no-sse")
#endif


/*
 Intel PCLMUL ghash based on white paper:
  "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the
   GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis.
 */
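
/* Note (added for exposition): gfmul_pclmul() computes the 128x128-bit
   carry-less product with the Karatsuba trick from the white paper above.
   Writing a = a1*x^64 + a0 and b = b1*x^64 + b0, three PCLMULQDQ
   instructions suffice because

     a*b = a1*b1*x^128 + ((a0+a1)*(b0+b1) + a0*b0 + a1*b1)*x^64 + a0*b0

   where '+' is XOR.  The 256-bit product is then shifted left by one bit
   (to account for GCM's reflected bit order) and reduced modulo
   x^128 + x^7 + x^2 + x + 1 in two phases, as described in the paper.  */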
static inline void gfmul_pclmul(void)
{
  /* Input: XMM0 and XMM1, Output: XMM1.  Input XMM0 stays unmodified.
     Inputs must be converted to little-endian.
   */
  asm volatile (/* gfmul, xmm0 has operator a and xmm1 has operator b. */
                "pshufd $78, %%xmm0, %%xmm2\n\t"
                "pshufd $78, %%xmm1, %%xmm4\n\t"
                "pxor %%xmm0, %%xmm2\n\t" /* xmm2 holds a0+a1 */
                "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds b0+b1 */

                "movdqa %%xmm0, %%xmm3\n\t"
                "pclmulqdq $0, %%xmm1, %%xmm3\n\t"  /* xmm3 holds a0*b0 */
                "movdqa %%xmm0, %%xmm6\n\t"
                "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */
                "movdqa %%xmm3, %%xmm5\n\t"
                "pclmulqdq $0, %%xmm2, %%xmm4\n\t"  /* xmm4 holds (a0+a1)*(b0+b1) */

                "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
                "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
                "movdqa %%xmm4, %%xmm5\n\t"
                "psrldq $8, %%xmm4\n\t"
                "pslldq $8, %%xmm5\n\t"
                "pxor %%xmm5, %%xmm3\n\t"
                "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the
                                             carry-less multiplication of xmm0
                                             by xmm1 */

                /* shift the result by one bit position to the left to cope with
                   the fact that the bits are reversed */
                "movdqa %%xmm3, %%xmm4\n\t"
                "movdqa %%xmm6, %%xmm5\n\t"
                "pslld $1, %%xmm3\n\t"
                "pslld $1, %%xmm6\n\t"
                "psrld $31, %%xmm4\n\t"
                "psrld $31, %%xmm5\n\t"
                "movdqa %%xmm4, %%xmm1\n\t"
                "pslldq $4, %%xmm5\n\t"
                "pslldq $4, %%xmm4\n\t"
                "psrldq $12, %%xmm1\n\t"
                "por %%xmm4, %%xmm3\n\t"
                "por %%xmm5, %%xmm6\n\t"
                "por %%xmm6, %%xmm1\n\t"

                /* first phase of the reduction */
                "movdqa %%xmm3, %%xmm6\n\t"
                "movdqa %%xmm3, %%xmm7\n\t"
                "pslld $31, %%xmm6\n\t"  /* packed left shift by 31 */
                "movdqa %%xmm3, %%xmm5\n\t"
                "pslld $30, %%xmm7\n\t"  /* packed left shift by 30 */
                "pslld $25, %%xmm5\n\t"  /* packed left shift by 25 */
                "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */
                "pxor %%xmm5, %%xmm6\n\t"
                "movdqa %%xmm6, %%xmm7\n\t"
                "pslldq $12, %%xmm6\n\t"
                "psrldq $4, %%xmm7\n\t"
                "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction
                                             complete */

                /* second phase of the reduction */
                "movdqa %%xmm3, %%xmm2\n\t"
                "movdqa %%xmm3, %%xmm4\n\t"
                "psrld $1, %%xmm2\n\t"   /* packed right shift by 1 */
                "movdqa %%xmm3, %%xmm5\n\t"
                "psrld $2, %%xmm4\n\t"   /* packed right shift by 2 */
                "psrld $7, %%xmm5\n\t"   /* packed right shift by 7 */
                "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
                "pxor %%xmm5, %%xmm2\n\t"
                "pxor %%xmm7, %%xmm2\n\t"
                "pxor %%xmm2, %%xmm3\n\t"
                "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */
                ::: "cc" );
}

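/* Reference sketch (added for exposition, kept out of the build): a plain
   byte-wise GF(2^128) multiplication in GCM bit order, following the
   textbook shift-and-xor algorithm.  The name gf128_mul_ref is hypothetical
   and not part of libgcrypt; a routine like this is only useful for
   cross-checking the PCLMUL path against a portable result.  */
#if 0
static void
gf128_mul_ref (unsigned char r[16], const unsigned char x[16],
               const unsigned char y[16])
{
  unsigned char z[16] = { 0 };
  unsigned char v[16];
  int i, j, k, lsb;

  memcpy (v, y, 16);

  /* Walk the bits of X most-significant byte first; in GCM bit order the
     first processed bit is the coefficient of x^0.  */
  for (i = 0; i < 16; i++)
    for (j = 7; j >= 0; j--)
      {
        if ((x[i] >> j) & 1)
          for (k = 0; k < 16; k++)
            z[k] ^= v[k];

        /* V = V * x; in the reflected representation this is a right shift,
           reduced by the GCM polynomial x^128 + x^7 + x^2 + x + 1 (0xe1 in
           reflected byte form) when a bit falls off the low end.  */
        lsb = v[15] & 1;
        for (k = 15; k > 0; k--)
          v[k] = (v[k] >> 1) | (v[k - 1] << 7);
        v[0] >>= 1;
        if (lsb)
          v[0] ^= 0xe1;
      }

  memcpy (r, z, 16);
}
#endif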

#ifdef __x86_64__
static inline void gfmul_pclmul_aggr4(void)
{
  /* Input:
      H¹: XMM0          X_i            : XMM6
      H²: XMM8          X_(i-1)        : XMM3
      H³: XMM9          X_(i-2)        : XMM2
      H⁴: XMM10         X_(i-3)⊕Y_(i-4): XMM1
     Output:
      Y_i: XMM1
     Input XMM0 stays unmodified.
     Inputs must be converted to little-endian.
   */
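  /* Note (added for exposition): this is the 4-block aggregated form of
     GHASH from the white paper.  Instead of four dependent multiplications
     it evaluates

       Y_i = X_i·H¹ ⊕ X_(i-1)·H² ⊕ X_(i-2)·H³ ⊕ (X_(i-3) ⊕ Y_(i-4))·H⁴

     which is algebraically identical to applying Y = (Y ⊕ X)·H four times,
     but needs only one shift-and-reduce step at the end.  */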
  asm volatile (/* perform clmul and merge results... */
                "pshufd $78, %%xmm10, %%xmm11\n\t"
                "pshufd $78, %%xmm1, %%xmm12\n\t"
                "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */
                "pxor %%xmm1, %%xmm12\n\t"  /* xmm12 holds 4:b0+b1 */

                "pshufd $78, %%xmm9, %%xmm13\n\t"
                "pshufd $78, %%xmm2, %%xmm14\n\t"
                "pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */
                "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */

                "pshufd $78, %%xmm8, %%xmm5\n\t"
                "pshufd $78, %%xmm3, %%xmm15\n\t"
                "pxor %%xmm8, %%xmm5\n\t"  /* xmm5 holds 2:a0+a1 */
                "pxor %%xmm3, %%xmm15\n\t" /* xmm15 holds 2:b0+b1 */

                "movdqa %%xmm10, %%xmm4\n\t"
                "movdqa %%xmm9, %%xmm7\n\t"
                "pclmulqdq $0, %%xmm1, %%xmm4\n\t"   /* xmm4 holds 4:a0*b0 */
                "pclmulqdq $0, %%xmm2, %%xmm7\n\t"   /* xmm7 holds 3:a0*b0 */
                "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */
                "pclmulqdq $17, %%xmm9, %%xmm2\n\t"  /* xmm2 holds 3:a1*b1 */
                "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */
                "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */

                "pshufd $78, %%xmm0, %%xmm10\n\t"
                "pshufd $78, %%xmm6, %%xmm11\n\t"
                "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */
                "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */

                "pxor %%xmm4, %%xmm7\n\t"   /* xmm7 holds 3+4:a0*b0 */
                "pxor %%xmm2, %%xmm1\n\t"   /* xmm1 holds 3+4:a1*b1 */
                "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */

                "movdqa %%xmm8, %%xmm13\n\t"
                "pclmulqdq $0, %%xmm3, %%xmm13\n\t" /* xmm13 holds 2:a0*b0 */
                "pclmulqdq $17, %%xmm8, %%xmm3\n\t" /* xmm3 holds 2:a1*b1 */
                "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */

                "pxor %%xmm13, %%xmm7\n\t"  /* xmm7 holds 2+3+4:a0*b0 */
                "pxor %%xmm3, %%xmm1\n\t"   /* xmm1 holds 2+3+4:a1*b1 */
                "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */

                "movdqa %%xmm0, %%xmm3\n\t"
                "pclmulqdq $0, %%xmm6, %%xmm3\n\t"  /* xmm3 holds 1:a0*b0 */
                "pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */
                "movdqa %%xmm11, %%xmm4\n\t"
                "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */

                "pxor %%xmm7, %%xmm3\n\t"  /* xmm3 holds 1+2+3+4:a0*b0 */
                "pxor %%xmm1, %%xmm6\n\t"  /* xmm6 holds 1+2+3+4:a1*b1 */
                "pxor %%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */

                /* aggregated reduction... */
                "movdqa %%xmm3, %%xmm5\n\t"
                "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */
                "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */
                "movdqa %%xmm4, %%xmm5\n\t"
                "psrldq $8, %%xmm4\n\t"
                "pslldq $8, %%xmm5\n\t"
                "pxor %%xmm5, %%xmm3\n\t"
                "pxor %%xmm4, %%xmm6\n\t" /* <xmm6:xmm3> holds the result of the
                                             aggregated carry-less
                                             multiplication */

                /* shift the result by one bit position to the left to cope with
                   the fact that the bits are reversed */
                "movdqa %%xmm3, %%xmm4\n\t"
                "movdqa %%xmm6, %%xmm5\n\t"
                "pslld $1, %%xmm3\n\t"
                "pslld $1, %%xmm6\n\t"
                "psrld $31, %%xmm4\n\t"
                "psrld $31, %%xmm5\n\t"
                "movdqa %%xmm4, %%xmm1\n\t"
                "pslldq $4, %%xmm5\n\t"
                "pslldq $4, %%xmm4\n\t"
                "psrldq $12, %%xmm1\n\t"
                "por %%xmm4, %%xmm3\n\t"
                "por %%xmm5, %%xmm6\n\t"
                "por %%xmm6, %%xmm1\n\t"

                /* first phase of the reduction */
                "movdqa %%xmm3, %%xmm6\n\t"
                "movdqa %%xmm3, %%xmm7\n\t"
                "pslld $31, %%xmm6\n\t"  /* packed left shift by 31 */
                "movdqa %%xmm3, %%xmm5\n\t"
                "pslld $30, %%xmm7\n\t"  /* packed left shift by 30 */
                "pslld $25, %%xmm5\n\t"  /* packed left shift by 25 */
                "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */
                "pxor %%xmm5, %%xmm6\n\t"
                "movdqa %%xmm6, %%xmm7\n\t"
                "pslldq $12, %%xmm6\n\t"
                "psrldq $4, %%xmm7\n\t"
                "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction
                                             complete */

                /* second phase of the reduction */
                "movdqa %%xmm3, %%xmm2\n\t"
                "movdqa %%xmm3, %%xmm4\n\t"
                "psrld $1, %%xmm2\n\t"   /* packed right shift by 1 */
                "movdqa %%xmm3, %%xmm5\n\t"
                "psrld $2, %%xmm4\n\t"   /* packed right shift by 2 */
                "psrld $7, %%xmm5\n\t"   /* packed right shift by 7 */
                "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */
                "pxor %%xmm5, %%xmm2\n\t"
                "pxor %%xmm7, %%xmm2\n\t"
                "pxor %%xmm2, %%xmm3\n\t"
                "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */
                ::: "cc" );
}
#endif


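/* Per-key set-up of the GHASH state for the PCLMUL implementation: the hash
   subkey H is byte-swapped in place (the asm code below works on a
   little-endian representation) and, on x86-64, the powers H², H³ and H⁴
   are precomputed into c->u_mode.gcm.gcm_table for the 4-block aggregated
   code path.  */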
void
_gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c)
{
  u64 tmp[2];
#if defined(__x86_64__) && defined(__WIN64__)
  char win64tmp[3 * 16];

  /* XMM6-XMM8 need to be restored after use. */
  asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
                "movdqu %%xmm7, 1*16(%0)\n\t"
                "movdqu %%xmm8, 2*16(%0)\n\t"
                :
                : "r" (win64tmp)
                : "memory");
#endif

  /* Swap endianness of hsub. */
  tmp[0] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 8);
  tmp[1] = buf_get_be64(c->u_mode.gcm.u_ghash_key.key + 0);
  buf_cpy (c->u_mode.gcm.u_ghash_key.key, tmp, GCRY_GCM_BLOCK_LEN);

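  /* On x86-64, compute H², H³ and H⁴ by repeated calls to gfmul_pclmul():
     xmm0 keeps the multiplier across calls while xmm1 receives the product,
     and xmm8 carries H² over for the H²•H² step.  The results are stored to
     gcm_table at offsets 0, 16 and 32.  */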
#ifdef __x86_64__
  asm volatile ("movdqu %[h_1], %%xmm0\n\t"
                "movdqa %%xmm0, %%xmm1\n\t"
                :
                : [h_1] "m" (*tmp));

  gfmul_pclmul (); /* H•H => H² */

  asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t"
                "movdqa %%xmm1, %%xmm8\n\t"
                :
                : [h_234] "r" (c->u_mode.gcm.gcm_table)
                : "memory");

  gfmul_pclmul (); /* H•H² => H³ */

  asm volatile ("movdqa %%xmm8, %%xmm0\n\t"
                "movdqu %%xmm1, 1*16(%[h_234])\n\t"
                "movdqa %%xmm8, %%xmm1\n\t"
                :
                : [h_234] "r" (c->u_mode.gcm.gcm_table)
                : "memory");

  gfmul_pclmul (); /* H²•H² => H⁴ */

  asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t"
                :
                : [h_234] "r" (c->u_mode.gcm.gcm_table)
                : "memory");

#ifdef __WIN64__
  /* Clear/restore used registers. */
  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
                "pxor %%xmm1, %%xmm1\n\t"
                "pxor %%xmm2, %%xmm2\n\t"
                "pxor %%xmm3, %%xmm3\n\t"
                "pxor %%xmm4, %%xmm4\n\t"
                "pxor %%xmm5, %%xmm5\n\t"
                "movdqu 0*16(%0), %%xmm6\n\t"
                "movdqu 1*16(%0), %%xmm7\n\t"
                "movdqu 2*16(%0), %%xmm8\n\t"
                :
                : "r" (win64tmp)
                : "memory");
#else
  /* Clear used registers. */
  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
                "pxor %%xmm1, %%xmm1\n\t"
                "pxor %%xmm2, %%xmm2\n\t"
                "pxor %%xmm3, %%xmm3\n\t"
                "pxor %%xmm4, %%xmm4\n\t"
                "pxor %%xmm5, %%xmm5\n\t"
                "pxor %%xmm6, %%xmm6\n\t"
                "pxor %%xmm7, %%xmm7\n\t"
                "pxor %%xmm8, %%xmm8\n\t"
                ::: "cc" );
#endif
#endif

  wipememory (tmp, sizeof(tmp));
}


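/* Hash NBLOCKS full 16-byte blocks from BUF into the running GHASH value at
   RESULT, using the PCLMUL code paths above.  The return value follows the
   usual ghash convention of reporting how many bytes of stack the caller
   should burn; it is always 0 here because all intermediate state is kept
   in XMM registers and cleared before returning.  */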
unsigned int
_gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf,
                          size_t nblocks)
{
  static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
    { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
  const unsigned int blocksize = GCRY_GCM_BLOCK_LEN;
#ifdef __WIN64__
  char win64tmp[10 * 16];
#endif

  if (nblocks == 0)
    return 0;

#ifdef __WIN64__
  /* XMM6-XMM15 need to be restored after use. */
  asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
                "movdqu %%xmm7, 1*16(%0)\n\t"
                "movdqu %%xmm8, 2*16(%0)\n\t"
                "movdqu %%xmm9, 3*16(%0)\n\t"
                "movdqu %%xmm10, 4*16(%0)\n\t"
                "movdqu %%xmm11, 5*16(%0)\n\t"
                "movdqu %%xmm12, 6*16(%0)\n\t"
                "movdqu %%xmm13, 7*16(%0)\n\t"
                "movdqu %%xmm14, 8*16(%0)\n\t"
                "movdqu %%xmm15, 9*16(%0)\n\t"
                :
                : "r" (win64tmp)
                : "memory" );
#endif

  /* Preload hash and H1. */
  asm volatile ("movdqu %[hash], %%xmm1\n\t"
                "movdqa %[hsub], %%xmm0\n\t"
                "pshufb %[be_mask], %%xmm1\n\t" /* be => le */
                :
                : [hash] "m" (*result), [be_mask] "m" (*be_mask),
                  [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key));

#ifdef __x86_64__
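  /* Process four blocks at a time with the aggregated multiplication; H²,
     H³ and H⁴ were precomputed into gcm_table by
     _gcry_ghash_setup_intel_pclmul.  */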
  if (nblocks >= 4)
    {
      do
        {
          asm volatile ("movdqa %[be_mask], %%xmm4\n\t"
                        "movdqu 0*16(%[buf]), %%xmm5\n\t"
                        "movdqu 1*16(%[buf]), %%xmm2\n\t"
                        "movdqu 2*16(%[buf]), %%xmm3\n\t"
                        "movdqu 3*16(%[buf]), %%xmm6\n\t"
                        "pshufb %%xmm4, %%xmm5\n\t" /* be => le */

                        /* Load H2, H3, H4. */
                        "movdqu 2*16(%[h_234]), %%xmm10\n\t"
                        "movdqu 1*16(%[h_234]), %%xmm9\n\t"
                        "movdqu 0*16(%[h_234]), %%xmm8\n\t"

                        "pxor %%xmm5, %%xmm1\n\t"
                        "pshufb %%xmm4, %%xmm2\n\t" /* be => le */
                        "pshufb %%xmm4, %%xmm3\n\t" /* be => le */
                        "pshufb %%xmm4, %%xmm6\n\t" /* be => le */
                        :
                        : [buf] "r" (buf), [be_mask] "m" (*be_mask),
                          [h_234] "r" (c->u_mode.gcm.gcm_table));

          gfmul_pclmul_aggr4 ();

          buf += 4 * blocksize;
          nblocks -= 4;
        }
      while (nblocks >= 4);

#ifndef __WIN64__
      /* Clear used x86-64/XMM registers. */
      asm volatile( "pxor %%xmm8, %%xmm8\n\t"
                    "pxor %%xmm9, %%xmm9\n\t"
                    "pxor %%xmm10, %%xmm10\n\t"
                    "pxor %%xmm11, %%xmm11\n\t"
                    "pxor %%xmm12, %%xmm12\n\t"
                    "pxor %%xmm13, %%xmm13\n\t"
                    "pxor %%xmm14, %%xmm14\n\t"
                    "pxor %%xmm15, %%xmm15\n\t"
                    ::: "cc" );
#endif
    }
#endif

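  /* Hash the remaining blocks one at a time: Y = (Y ⊕ X_i)·H.  */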
  while (nblocks--)
    {
      asm volatile ("movdqu %[buf], %%xmm2\n\t"
                    "pshufb %[be_mask], %%xmm2\n\t" /* be => le */
                    "pxor %%xmm2, %%xmm1\n\t"
                    :
                    : [buf] "m" (*buf), [be_mask] "m" (*be_mask));

      gfmul_pclmul ();

      buf += blocksize;
    }

  /* Store hash. */
  asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* le => be */
                "movdqu %%xmm1, %[hash]\n\t"
                : [hash] "=m" (*result)
                : [be_mask] "m" (*be_mask));

#ifdef __WIN64__
  /* Clear/restore used registers. */
  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
                "pxor %%xmm1, %%xmm1\n\t"
                "pxor %%xmm2, %%xmm2\n\t"
                "pxor %%xmm3, %%xmm3\n\t"
                "pxor %%xmm4, %%xmm4\n\t"
                "pxor %%xmm5, %%xmm5\n\t"
                "movdqu 0*16(%0), %%xmm6\n\t"
                "movdqu 1*16(%0), %%xmm7\n\t"
                "movdqu 2*16(%0), %%xmm8\n\t"
                "movdqu 3*16(%0), %%xmm9\n\t"
                "movdqu 4*16(%0), %%xmm10\n\t"
                "movdqu 5*16(%0), %%xmm11\n\t"
                "movdqu 6*16(%0), %%xmm12\n\t"
                "movdqu 7*16(%0), %%xmm13\n\t"
                "movdqu 8*16(%0), %%xmm14\n\t"
                "movdqu 9*16(%0), %%xmm15\n\t"
                :
                : "r" (win64tmp)
                : "memory" );
#else
  /* Clear used registers. */
  asm volatile( "pxor %%xmm0, %%xmm0\n\t"
                "pxor %%xmm1, %%xmm1\n\t"
                "pxor %%xmm2, %%xmm2\n\t"
                "pxor %%xmm3, %%xmm3\n\t"
                "pxor %%xmm4, %%xmm4\n\t"
                "pxor %%xmm5, %%xmm5\n\t"
                "pxor %%xmm6, %%xmm6\n\t"
                "pxor %%xmm7, %%xmm7\n\t"
                ::: "cc" );
#endif

  return 0;
}

#endif /* GCM_USE_INTEL_PCLMUL */