@@ -9,6 +9,17 @@
* Vinodh Gopal <vinodh.gopal@intel.com>
* Kahraman Akdemir
*
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
+ * Aidan O'Mahony (aidan.o.mahony@intel.com)
+ * Adrian Hoban <adrian.hoban@intel.com>
+ * James Guilford (james.guilford@intel.com)
+ * Gabriele Paoloni <gabriele.paoloni@intel.com>
+ * Tadeusz Struk (tadeusz.struk@intel.com)
+ * Wajdi Feghali (wajdi.k.feghali@intel.com)
+ * Copyright (c) 2010, Intel Corporation.
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -18,8 +29,60 @@
#include <linux/linkage.h>
#include <asm/inst.h>

+.data
+POLY: .octa 0xC2000000000000000000000000000001
+TWOONE: .octa 0x00000001000000000000000000000001
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK,
+# and ZERO should follow ALL_F
+
+SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
+MASK1: .octa 0x0000000000000000ffffffffffffffff
+MASK2: .octa 0xffffffffffffffff0000000000000000
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F: .octa 0xffffffffffffffffffffffffffffffff
+ZERO: .octa 0x00000000000000000000000000000000
+ONE: .octa 0x00000000000000000000000000000001
+F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
+dec: .octa 0x1
+enc: .octa 0x2
+
+
.text
+
+#define STACK_OFFSET 8*3
+#define HashKey 16*0 // store HashKey <<1 mod poly here
+#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
+#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
+#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
+#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
+ // bits of HashKey <<1 mod poly here
+ //(for Karatsuba purposes)
+#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
+ // bits of HashKey^2 <<1 mod poly here
+ // (for Karatsuba purposes)
+#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
+ // bits of HashKey^3 <<1 mod poly here
+ // (for Karatsuba purposes)
+#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
+ // bits of HashKey^4 <<1 mod poly here
+ // (for Karatsuba purposes)
+#define VARIABLE_OFFSET 16*8
+
+#define arg1 rdi
+#define arg2 rsi
+#define arg3 rdx
+#define arg4 rcx
+#define arg5 r8
+#define arg6 r9
+#define arg7 STACK_OFFSET+8(%r14)
+#define arg8 STACK_OFFSET+16(%r14)
+#define arg9 STACK_OFFSET+24(%r14)
+#define arg10 STACK_OFFSET+32(%r14)
+
+
#define STATE1 %xmm0
#define STATE2 %xmm4
#define STATE3 %xmm5
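
The .data constants above drive the byte-order handling: SHUF_MASK is the pshufb pattern that byte-reverses a 16-byte block, moving data between the little-endian xmm layout and the big-endian bit order GHASH is defined over. A plain-C equivalent of that shuffle, for reference only (illustrative sketch, not part of the patch):

    #include <stdint.h>

    /* Equivalent of "pshufb SHUF_MASK(%rip), %xmmN": SHUF_MASK byte i selects
     * source byte 15-i, so the whole 16-byte block is reversed. */
    static void byte_reflect(uint8_t out[16], const uint8_t in[16])
    {
        for (int i = 0; i < 16; i++)
            out[i] = in[15 - i];
    }
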
@@ -47,6 +110,1135 @@
#define T2 %r11
#define TCTR_LOW T2

+
+/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+*
+*
+* Input: A and B (128-bits each, bit-reflected)
+* Output: C = A*B*x mod poly, (i.e. >>1 )
+* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+*
+*/
+.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
+ movdqa \GH, \TMP1
+ pshufd $78, \GH, \TMP2
+ pshufd $78, \HK, \TMP3
+ pxor \GH, \TMP2 # TMP2 = a1+a0
+ pxor \HK, \TMP3 # TMP3 = b1+b0
+ PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
+ PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
+ pxor \GH, \TMP2
+ pxor \TMP1, \TMP2 # TMP2 = (a1*b0)+(a0*b1)
+ movdqa \TMP2, \TMP3
+ pslldq $8, \TMP3 # left shift TMP3 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP3, \GH
+ pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
+
+ # first phase of the reduction
+
+ movdqa \GH, \TMP2
+ movdqa \GH, \TMP3
+ movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
+ # in order to perform
+ # independent shifts
+ pslld $31, \TMP2 # packed right shift <<31
+ pslld $30, \TMP3 # packed right shift <<30
+ pslld $25, \TMP4 # packed right shift <<25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP5
+ psrldq $4, \TMP5 # right shift TMP5 1 DW
+ pslldq $12, \TMP2 # left shift TMP2 3 DWs
+ pxor \TMP2, \GH
+
+ # second phase of the reduction
+
+ movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
+ # in order to perform
+ # independent shifts
+ movdqa \GH,\TMP3
+ movdqa \GH,\TMP4
+ psrld $1,\TMP2 # packed left shift >>1
+ psrld $2,\TMP3 # packed left shift >>2
+ psrld $7,\TMP4 # packed left shift >>7
+ pxor \TMP3,\TMP2 # xor the shifted versions
+ pxor \TMP4,\TMP2
+ pxor \TMP5, \TMP2
+ pxor \TMP2, \GH
+ pxor \TMP1, \GH # result is in GH
+.endm
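
GHASH_MUL uses Karatsuba to get away with three PCLMULQDQs instead of four: it multiplies the high halves, the low halves, and the XOR of the halves, then recombines. The same 128x128 to 256-bit carry-less multiply with compiler intrinsics (reduction omitted; an illustrative sketch assuming PCLMUL support, not code from the patch):

    #include <immintrin.h>                  /* build with -mpclmul -msse2 */

    /* Karatsuba carry-less multiply, as in GHASH_MUL before the reduction. */
    static void clmul_karatsuba(__m128i a, __m128i b, __m128i *hi, __m128i *lo)
    {
        __m128i a1b1 = _mm_clmulepi64_si128(a, b, 0x11);            /* high halves */
        __m128i a0b0 = _mm_clmulepi64_si128(a, b, 0x00);            /* low halves  */
        __m128i asum = _mm_xor_si128(a, _mm_shuffle_epi32(a, 78));  /* a1 ^ a0     */
        __m128i bsum = _mm_xor_si128(b, _mm_shuffle_epi32(b, 78));  /* b1 ^ b0     */
        __m128i mid  = _mm_clmulepi64_si128(asum, bsum, 0x00);      /* (a1^a0)*(b1^b0) */
        mid = _mm_xor_si128(mid, _mm_xor_si128(a1b1, a0b0));        /* a1*b0 ^ a0*b1   */
        *hi = _mm_xor_si128(a1b1, _mm_srli_si128(mid, 8));
        *lo = _mm_xor_si128(a0b0, _mm_slli_si128(mid, 8));
    }
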
|
|
|
|
+
|
|
|
|
+/*
|
|
|
|
+* if a = number of total plaintext bytes
|
|
|
|
+* b = floor(a/16)
|
|
|
|
+* num_initial_blocks = b mod 4
|
|
|
|
+* encrypt the initial num_initial_blocks blocks and apply ghash on
|
|
|
|
+* the ciphertext
|
|
|
|
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
|
|
|
|
+* are clobbered
|
|
|
|
+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
|
|
|
|
+*/
|
|
|
|
+
|
|
|
|
+.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
|
|
|
|
+XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
|
|
|
|
+
|
|
|
|
+ mov arg7, %r10 # %r10 = AAD
|
|
|
|
+ mov arg8, %r12 # %r12 = aadLen
|
|
|
|
+ mov %r12, %r11
|
|
|
|
+ pxor %xmm\i, %xmm\i
|
|
|
|
+_get_AAD_loop\num_initial_blocks\operation:
|
|
|
|
+ movd (%r10), \TMP1
|
|
|
|
+ pslldq $12, \TMP1
|
|
|
|
+ psrldq $4, %xmm\i
|
|
|
|
+ pxor \TMP1, %xmm\i
|
|
|
|
+ add $4, %r10
|
|
|
|
+ sub $4, %r12
|
|
|
|
+ jne _get_AAD_loop\num_initial_blocks\operation
|
|
|
|
+ cmp $16, %r11
|
|
|
|
+ je _get_AAD_loop2_done\num_initial_blocks\operation
|
|
|
|
+ mov $16, %r12
|
|
|
|
+_get_AAD_loop2\num_initial_blocks\operation:
|
|
|
|
+ psrldq $4, %xmm\i
|
|
|
|
+ sub $4, %r12
|
|
|
|
+ cmp %r11, %r12
|
|
|
|
+ jne _get_AAD_loop2\num_initial_blocks\operation
|
|
|
|
+_get_AAD_loop2_done\num_initial_blocks\operation:
|
|
|
|
+ pshufb SHUF_MASK(%rip), %xmm\i # byte-reflect the AAD data
|
|
|
|
+ xor %r11, %r11 # initialise the data pointer offset as zero
|
|
|
|
+
|
|
|
|
+ # start AES for num_initial_blocks blocks
|
|
|
|
+
|
|
|
|
+ mov %arg5, %rax # %rax = *Y0
|
|
|
|
+ movdqu (%rax), \XMM0 # XMM0 = Y0
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM0
|
|
|
|
+.if \i_seq != 0
|
|
|
|
+.irpc index, \i_seq
|
|
|
|
+ paddd ONE(%rip), \XMM0 # INCR Y0
|
|
|
|
+ movdqa \XMM0, %xmm\index
|
|
|
|
+ pshufb SHUF_MASK(%rip), %xmm\index # perform a 16 byte swap
|
|
|
|
+.endr
|
|
|
|
+.irpc index, \i_seq
|
|
|
|
+ pxor 16*0(%arg1), %xmm\index
|
|
|
|
+.endr
|
|
|
|
+.irpc index, \i_seq
|
|
|
|
+ movaps 0x10(%rdi), \TMP1
|
|
|
|
+ AESENC \TMP1, %xmm\index # Round 1
|
|
|
|
+.endr
|
|
|
|
+.irpc index, \i_seq
|
|
|
|
+ movaps 0x20(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, %xmm\index # Round 2
|
|
|
|
+.endr
|
|
|
|
+.irpc index, \i_seq
|
|
|
|
+ movaps 0x30(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, %xmm\index # Round 3
|
|
|
|
+.endr
|
|
|
|
+.irpc index, \i_seq
|
|
|
|
+ movaps 0x40(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, %xmm\index # Round 4
|
|
|
|
+.endr
|
|
|
|
+.irpc index, \i_seq
|
|
|
|
+ movaps 0x50(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, %xmm\index # Round 5
|
|
|
|
+.endr
|
|
|
|
+.irpc index, \i_seq
|
|
|
|
+ movaps 0x60(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, %xmm\index # Round 6
|
|
|
|
+.endr
|
|
|
|
+.irpc index, \i_seq
|
|
|
|
+ movaps 0x70(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, %xmm\index # Round 7
|
|
|
|
+.endr
|
|
|
|
+.irpc index, \i_seq
|
|
|
|
+ movaps 0x80(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, %xmm\index # Round 8
|
|
|
|
+.endr
|
|
|
|
+.irpc index, \i_seq
|
|
|
|
+ movaps 0x90(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, %xmm\index # Round 9
|
|
|
|
+.endr
|
|
|
|
+.irpc index, \i_seq
|
|
|
|
+ movaps 0xa0(%arg1), \TMP1
|
|
|
|
+ AESENCLAST \TMP1, %xmm\index # Round 10
|
|
|
|
+.endr
|
|
|
|
+.irpc index, \i_seq
|
|
|
|
+ movdqu (%arg3 , %r11, 1), \TMP1
|
|
|
|
+ pxor \TMP1, %xmm\index
|
|
|
|
+ movdqu %xmm\index, (%arg2 , %r11, 1)
|
|
|
|
+ # write back plaintext/ciphertext for num_initial_blocks
|
|
|
|
+ add $16, %r11
|
|
|
|
+.if \operation == dec
|
|
|
|
+ movdqa \TMP1, %xmm\index
|
|
|
|
+.endif
|
|
|
|
+ pshufb SHUF_MASK(%rip), %xmm\index
|
|
|
|
+ # prepare plaintext/ciphertext for GHASH computation
|
|
|
|
+.endr
|
|
|
|
+.endif
|
|
|
|
+ GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
|
|
|
|
+ # apply GHASH on num_initial_blocks blocks
|
|
|
|
+
|
|
|
|
+.if \i == 5
|
|
|
|
+ pxor %xmm5, %xmm6
|
|
|
|
+ GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
|
|
|
|
+ pxor %xmm6, %xmm7
|
|
|
|
+ GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
|
|
|
|
+ pxor %xmm7, %xmm8
|
|
|
|
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
|
|
|
|
+.elseif \i == 6
|
|
|
|
+ pxor %xmm6, %xmm7
|
|
|
|
+ GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
|
|
|
|
+ pxor %xmm7, %xmm8
|
|
|
|
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
|
|
|
|
+.elseif \i == 7
|
|
|
|
+ pxor %xmm7, %xmm8
|
|
|
|
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
|
|
|
|
+.endif
|
|
|
|
+ cmp $64, %r13
|
|
|
|
+ jl _initial_blocks_done\num_initial_blocks\operation
|
|
|
|
+ # no need for precomputed values
|
|
|
|
+/*
|
|
|
|
+*
|
|
|
|
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
|
|
|
|
+* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
|
|
|
|
+*/
|
|
|
|
+ paddd ONE(%rip), \XMM0 # INCR Y0
|
|
|
|
+ movdqa \XMM0, \XMM1
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
|
|
|
|
+ paddd ONE(%rip), \XMM0 # INCR Y0
|
|
|
|
+ movdqa \XMM0, \XMM2
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
|
|
|
|
+ paddd ONE(%rip), \XMM0 # INCR Y0
|
|
|
|
+ movdqa \XMM0, \XMM3
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
|
|
|
|
+ paddd ONE(%rip), \XMM0 # INCR Y0
|
|
|
|
+ movdqa \XMM0, \XMM4
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
|
|
|
|
+ pxor 16*0(%arg1), \XMM1
|
|
|
|
+ pxor 16*0(%arg1), \XMM2
|
|
|
|
+ pxor 16*0(%arg1), \XMM3
|
|
|
|
+ pxor 16*0(%arg1), \XMM4
|
|
|
|
+ movdqa \TMP3, \TMP5
|
|
|
|
+ pshufd $78, \TMP3, \TMP1
|
|
|
|
+ pxor \TMP3, \TMP1
|
|
|
|
+ movdqa \TMP1, HashKey_k(%rsp)
|
|
|
|
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
|
|
|
|
+# TMP5 = HashKey^2<<1 (mod poly)
|
|
|
|
+ movdqa \TMP5, HashKey_2(%rsp)
|
|
|
|
+# HashKey_2 = HashKey^2<<1 (mod poly)
|
|
|
|
+ pshufd $78, \TMP5, \TMP1
|
|
|
|
+ pxor \TMP5, \TMP1
|
|
|
|
+ movdqa \TMP1, HashKey_2_k(%rsp)
|
|
|
|
+.irpc index, 1234 # do 4 rounds
|
|
|
|
+ movaps 0x10*\index(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, \XMM1
|
|
|
|
+ AESENC \TMP1, \XMM2
|
|
|
|
+ AESENC \TMP1, \XMM3
|
|
|
|
+ AESENC \TMP1, \XMM4
|
|
|
|
+.endr
|
|
|
|
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
|
|
|
|
+# TMP5 = HashKey^3<<1 (mod poly)
|
|
|
|
+ movdqa \TMP5, HashKey_3(%rsp)
|
|
|
|
+ pshufd $78, \TMP5, \TMP1
|
|
|
|
+ pxor \TMP5, \TMP1
|
|
|
|
+ movdqa \TMP1, HashKey_3_k(%rsp)
|
|
|
|
+.irpc index, 56789 # do next 5 rounds
|
|
|
|
+ movaps 0x10*\index(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, \XMM1
|
|
|
|
+ AESENC \TMP1, \XMM2
|
|
|
|
+ AESENC \TMP1, \XMM3
|
|
|
|
+ AESENC \TMP1, \XMM4
|
|
|
|
+.endr
|
|
|
|
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
|
|
|
|
+# TMP5 = HashKey^4<<1 (mod poly)
|
|
|
|
+ movdqa \TMP5, HashKey_4(%rsp)
|
|
|
|
+ pshufd $78, \TMP5, \TMP1
|
|
|
|
+ pxor \TMP5, \TMP1
|
|
|
|
+ movdqa \TMP1, HashKey_4_k(%rsp)
|
|
|
|
+ movaps 0xa0(%arg1), \TMP2
|
|
|
|
+ AESENCLAST \TMP2, \XMM1
|
|
|
|
+ AESENCLAST \TMP2, \XMM2
|
|
|
|
+ AESENCLAST \TMP2, \XMM3
|
|
|
|
+ AESENCLAST \TMP2, \XMM4
|
|
|
|
+ movdqu 16*0(%arg3 , %r11 , 1), \TMP1
|
|
|
|
+ pxor \TMP1, \XMM1
|
|
|
|
+.if \operation == dec
|
|
|
|
+ movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
|
|
|
|
+ movdqa \TMP1, \XMM1
|
|
|
|
+.endif
|
|
|
|
+ movdqu 16*1(%arg3 , %r11 , 1), \TMP1
|
|
|
|
+ pxor \TMP1, \XMM2
|
|
|
|
+.if \operation == dec
|
|
|
|
+ movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
|
|
|
|
+ movdqa \TMP1, \XMM2
|
|
|
|
+.endif
|
|
|
|
+ movdqu 16*2(%arg3 , %r11 , 1), \TMP1
|
|
|
|
+ pxor \TMP1, \XMM3
|
|
|
|
+.if \operation == dec
|
|
|
|
+ movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
|
|
|
|
+ movdqa \TMP1, \XMM3
|
|
|
|
+.endif
|
|
|
|
+ movdqu 16*3(%arg3 , %r11 , 1), \TMP1
|
|
|
|
+ pxor \TMP1, \XMM4
|
|
|
|
+.if \operation == dec
|
|
|
|
+ movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
|
|
|
|
+ movdqa \TMP1, \XMM4
|
|
|
|
+.else
|
|
|
|
+ movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
|
|
|
|
+ movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
|
|
|
|
+ movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
|
|
|
|
+ movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
|
|
|
|
+.endif
|
|
|
|
+ add $64, %r11
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
|
|
|
|
+ pxor \XMMDst, \XMM1
|
|
|
|
+# combine GHASHed value with the corresponding ciphertext
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
|
|
|
|
+_initial_blocks_done\num_initial_blocks\operation:
|
|
|
|
+.endm
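
The _get_AAD_loop at the top of INITIAL_BLOCKS reads the AAD four bytes at a time and leaves it zero-padded to one 16-byte block before the byte reflection. The net effect in C (illustrative sketch only):

    #include <stdint.h>
    #include <string.h>

    /* Zero-pad the RFC4106 AAD (8 or 12 bytes, 16 also handled) to a GHASH block. */
    static void pad_aad_block(uint8_t block[16], const uint8_t *aad, uint64_t aad_len)
    {
        memset(block, 0, 16);          /* AAD padded to 128 bits with 0 */
        memcpy(block, aad, aad_len);
    }
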
|
|
|
|
+
|
|
|
|
+/*
|
|
|
|
+* encrypt 4 blocks at a time
|
|
|
|
+* ghash the 4 previously encrypted ciphertext blocks
|
|
|
|
+* arg1, %arg2, %arg3 are used as pointers only, not modified
|
|
|
|
+* %r11 is the data offset value
|
|
|
|
+*/
|
|
|
|
+.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \
|
|
|
|
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
|
|
|
|
+
|
|
|
|
+ movdqa \XMM1, \XMM5
|
|
|
|
+ movdqa \XMM2, \XMM6
|
|
|
|
+ movdqa \XMM3, \XMM7
|
|
|
|
+ movdqa \XMM4, \XMM8
|
|
|
|
+
|
|
|
|
+ # multiply TMP5 * HashKey using karatsuba
|
|
|
|
+
|
|
|
|
+ movdqa \XMM5, \TMP4
|
|
|
|
+ pshufd $78, \XMM5, \TMP6
|
|
|
|
+ pxor \XMM5, \TMP6
|
|
|
|
+ paddd ONE(%rip), \XMM0 # INCR CNT
|
|
|
|
+ movdqa HashKey_4(%rsp), \TMP5
|
|
|
|
+ PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
|
|
|
|
+ movdqa \XMM0, \XMM1
|
|
|
|
+ paddd ONE(%rip), \XMM0 # INCR CNT
|
|
|
|
+ movdqa \XMM0, \XMM2
|
|
|
|
+ paddd ONE(%rip), \XMM0 # INCR CNT
|
|
|
|
+ movdqa \XMM0, \XMM3
|
|
|
|
+ paddd ONE(%rip), \XMM0 # INCR CNT
|
|
|
|
+ movdqa \XMM0, \XMM4
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
|
|
|
|
+ PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
|
|
|
|
+ pxor (%arg1), \XMM1
|
|
|
|
+ pxor (%arg1), \XMM2
|
|
|
|
+ pxor (%arg1), \XMM3
|
|
|
|
+ pxor (%arg1), \XMM4
|
|
|
|
+ movdqa HashKey_4_k(%rsp), \TMP5
|
|
|
|
+ PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
|
|
|
|
+ movaps 0x10(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, \XMM1 # Round 1
|
|
|
|
+ AESENC \TMP1, \XMM2
|
|
|
|
+ AESENC \TMP1, \XMM3
|
|
|
|
+ AESENC \TMP1, \XMM4
|
|
|
|
+ movaps 0x20(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, \XMM1 # Round 2
|
|
|
|
+ AESENC \TMP1, \XMM2
|
|
|
|
+ AESENC \TMP1, \XMM3
|
|
|
|
+ AESENC \TMP1, \XMM4
|
|
|
|
+ movdqa \XMM6, \TMP1
|
|
|
|
+ pshufd $78, \XMM6, \TMP2
|
|
|
|
+ pxor \XMM6, \TMP2
|
|
|
|
+ movdqa HashKey_3(%rsp), \TMP5
|
|
|
|
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
|
|
|
|
+ movaps 0x30(%arg1), \TMP3
|
|
|
|
+ AESENC \TMP3, \XMM1 # Round 3
|
|
|
|
+ AESENC \TMP3, \XMM2
|
|
|
|
+ AESENC \TMP3, \XMM3
|
|
|
|
+ AESENC \TMP3, \XMM4
|
|
|
|
+ PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
|
|
|
|
+ movaps 0x40(%arg1), \TMP3
|
|
|
|
+ AESENC \TMP3, \XMM1 # Round 4
|
|
|
|
+ AESENC \TMP3, \XMM2
|
|
|
|
+ AESENC \TMP3, \XMM3
|
|
|
|
+ AESENC \TMP3, \XMM4
|
|
|
|
+ movdqa HashKey_3_k(%rsp), \TMP5
|
|
|
|
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
|
|
|
|
+ movaps 0x50(%arg1), \TMP3
|
|
|
|
+ AESENC \TMP3, \XMM1 # Round 5
|
|
|
|
+ AESENC \TMP3, \XMM2
|
|
|
|
+ AESENC \TMP3, \XMM3
|
|
|
|
+ AESENC \TMP3, \XMM4
|
|
|
|
+ pxor \TMP1, \TMP4
|
|
|
|
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
|
|
|
|
+ pxor \XMM6, \XMM5
|
|
|
|
+ pxor \TMP2, \TMP6
|
|
|
|
+ movdqa \XMM7, \TMP1
|
|
|
|
+ pshufd $78, \XMM7, \TMP2
|
|
|
|
+ pxor \XMM7, \TMP2
|
|
|
|
+ movdqa HashKey_2(%rsp ), \TMP5
|
|
|
|
+
|
|
|
|
+ # Multiply TMP5 * HashKey using karatsuba
|
|
|
|
+
|
|
|
|
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
|
|
|
|
+ movaps 0x60(%arg1), \TMP3
|
|
|
|
+ AESENC \TMP3, \XMM1 # Round 6
|
|
|
|
+ AESENC \TMP3, \XMM2
|
|
|
|
+ AESENC \TMP3, \XMM3
|
|
|
|
+ AESENC \TMP3, \XMM4
|
|
|
|
+ PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
|
|
|
|
+ movaps 0x70(%arg1), \TMP3
|
|
|
|
+ AESENC \TMP3, \XMM1 # Round 7
|
|
|
|
+ AESENC \TMP3, \XMM2
|
|
|
|
+ AESENC \TMP3, \XMM3
|
|
|
|
+ AESENC \TMP3, \XMM4
|
|
|
|
+ movdqa HashKey_2_k(%rsp), \TMP5
|
|
|
|
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
|
|
|
|
+ movaps 0x80(%arg1), \TMP3
|
|
|
|
+ AESENC \TMP3, \XMM1 # Round 8
|
|
|
|
+ AESENC \TMP3, \XMM2
|
|
|
|
+ AESENC \TMP3, \XMM3
|
|
|
|
+ AESENC \TMP3, \XMM4
|
|
|
|
+ pxor \TMP1, \TMP4
|
|
|
|
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
|
|
|
|
+ pxor \XMM7, \XMM5
|
|
|
|
+ pxor \TMP2, \TMP6
|
|
|
|
+
|
|
|
|
+ # Multiply XMM8 * HashKey
|
|
|
|
+ # XMM8 and TMP5 hold the values for the two operands
|
|
|
|
+
|
|
|
|
+ movdqa \XMM8, \TMP1
|
|
|
|
+ pshufd $78, \XMM8, \TMP2
|
|
|
|
+ pxor \XMM8, \TMP2
|
|
|
|
+ movdqa HashKey(%rsp), \TMP5
|
|
|
|
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
|
|
|
|
+ movaps 0x90(%arg1), \TMP3
|
|
|
|
+ AESENC \TMP3, \XMM1 # Round 9
|
|
|
|
+ AESENC \TMP3, \XMM2
|
|
|
|
+ AESENC \TMP3, \XMM3
|
|
|
|
+ AESENC \TMP3, \XMM4
|
|
|
|
+ PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
|
|
|
|
+ movaps 0xa0(%arg1), \TMP3
|
|
|
|
+ AESENCLAST \TMP3, \XMM1 # Round 10
|
|
|
|
+ AESENCLAST \TMP3, \XMM2
|
|
|
|
+ AESENCLAST \TMP3, \XMM3
|
|
|
|
+ AESENCLAST \TMP3, \XMM4
|
|
|
|
+ movdqa HashKey_k(%rsp), \TMP5
|
|
|
|
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
|
|
|
|
+ movdqu (%arg3,%r11,1), \TMP3
|
|
|
|
+ pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
|
|
|
|
+.if \operation == dec
|
|
|
|
+ movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
|
|
|
|
+ movdqa \TMP3, \XMM1
|
|
|
|
+.endif
|
|
|
|
+ movdqu 16(%arg3,%r11,1), \TMP3
|
|
|
|
+ pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
|
|
|
|
+.if \operation == dec
|
|
|
|
+ movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
|
|
|
|
+ movdqa \TMP3, \XMM2
|
|
|
|
+.endif
|
|
|
|
+ movdqu 32(%arg3,%r11,1), \TMP3
|
|
|
|
+ pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
|
|
|
|
+.if \operation == dec
|
|
|
|
+ movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
|
|
|
|
+ movdqa \TMP3, \XMM3
|
|
|
|
+.endif
|
|
|
|
+ movdqu 48(%arg3,%r11,1), \TMP3
|
|
|
|
+ pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
|
|
|
|
+.if \operation == dec
|
|
|
|
+ movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
|
|
|
|
+ movdqa \TMP3, \XMM4
|
|
|
|
+.else
|
|
|
|
+ movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
|
|
|
|
+ movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
|
|
|
|
+ movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
|
|
|
|
+ movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
|
|
|
|
+.endif
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
|
|
|
|
+ pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
|
|
|
|
+
|
|
|
|
+ pxor \TMP4, \TMP1
|
|
|
|
+ pxor \XMM8, \XMM5
|
|
|
|
+ pxor \TMP6, \TMP2
|
|
|
|
+ pxor \TMP1, \TMP2
|
|
|
|
+ pxor \XMM5, \TMP2
|
|
|
|
+ movdqa \TMP2, \TMP3
|
|
|
|
+ pslldq $8, \TMP3 # left shift TMP3 2 DWs
|
|
|
|
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
|
|
|
|
+ pxor \TMP3, \XMM5
|
|
|
|
+ pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
|
|
|
|
+
|
|
|
|
+ # first phase of reduction
|
|
|
|
+
|
|
|
|
+ movdqa \XMM5, \TMP2
|
|
|
|
+ movdqa \XMM5, \TMP3
|
|
|
|
+ movdqa \XMM5, \TMP4
|
|
|
|
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
|
|
|
|
+ pslld $31, \TMP2 # packed right shift << 31
|
|
|
|
+ pslld $30, \TMP3 # packed right shift << 30
|
|
|
|
+ pslld $25, \TMP4 # packed right shift << 25
|
|
|
|
+ pxor \TMP3, \TMP2 # xor the shifted versions
|
|
|
|
+ pxor \TMP4, \TMP2
|
|
|
|
+ movdqa \TMP2, \TMP5
|
|
|
|
+ psrldq $4, \TMP5 # right shift T5 1 DW
|
|
|
|
+ pslldq $12, \TMP2 # left shift T2 3 DWs
|
|
|
|
+ pxor \TMP2, \XMM5
|
|
|
|
+
|
|
|
|
+ # second phase of reduction
|
|
|
|
+
|
|
|
|
+ movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
|
|
|
|
+ movdqa \XMM5,\TMP3
|
|
|
|
+ movdqa \XMM5,\TMP4
|
|
|
|
+ psrld $1, \TMP2 # packed left shift >>1
|
|
|
|
+ psrld $2, \TMP3 # packed left shift >>2
|
|
|
|
+ psrld $7, \TMP4 # packed left shift >>7
|
|
|
|
+ pxor \TMP3,\TMP2 # xor the shifted versions
|
|
|
|
+ pxor \TMP4,\TMP2
|
|
|
|
+ pxor \TMP5, \TMP2
|
|
|
|
+ pxor \TMP2, \XMM5
|
|
|
|
+ pxor \TMP1, \XMM5 # result is in XMM5
|
|
|
|
+
|
|
|
|
+ pxor \XMM5, \XMM1
|
|
|
|
+.endm
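
GHASH_4_ENCRYPT_4_PARALLEL always works on exactly four blocks per call, overlapping the AES rounds of the next four counter blocks with the GHASH of the previous four ciphertext blocks. The callers make that possible by sizing the prologue so the remaining whole blocks are a multiple of four; restated in C (illustrative sketch, names are not from the patch):

    #include <stdint.h>

    /* Split a payload the way aesni_gcm_enc/dec do: a 0..3 block prologue
     * (INITIAL_BLOCKS), a 4-blocks-per-iteration main loop, and a <16 byte tail. */
    static void split_blocks(uint64_t len, unsigned *prologue,
                             uint64_t *quads, uint64_t *tail)
    {
        uint64_t whole = len >> 4;             /* number of full 16-byte blocks */
        *prologue = (unsigned)(whole & 3);     /* handled by INITIAL_BLOCKS     */
        *quads    = (whole - *prologue) >> 2;  /* iterations of the 4-wide loop */
        *tail     = len & 15;                  /* partial block handled last    */
    }
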
|
|
|
|
+
|
|
|
|
+/* GHASH the last 4 ciphertext blocks. */
|
|
|
|
+.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
|
|
|
|
+TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
|
|
|
|
+
|
|
|
|
+ # Multiply TMP6 * HashKey (using Karatsuba)
|
|
|
|
+
|
|
|
|
+ movdqa \XMM1, \TMP6
|
|
|
|
+ pshufd $78, \XMM1, \TMP2
|
|
|
|
+ pxor \XMM1, \TMP2
|
|
|
|
+ movdqa HashKey_4(%rsp), \TMP5
|
|
|
|
+ PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
|
|
|
|
+ PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
|
|
|
|
+ movdqa HashKey_4_k(%rsp), \TMP4
|
|
|
|
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
|
|
|
|
+ movdqa \XMM1, \XMMDst
|
|
|
|
+ movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
|
|
|
|
+
|
|
|
|
+ # Multiply TMP1 * HashKey (using Karatsuba)
|
|
|
|
+
|
|
|
|
+ movdqa \XMM2, \TMP1
|
|
|
|
+ pshufd $78, \XMM2, \TMP2
|
|
|
|
+ pxor \XMM2, \TMP2
|
|
|
|
+ movdqa HashKey_3(%rsp), \TMP5
|
|
|
|
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
|
|
|
|
+ PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
|
|
|
|
+ movdqa HashKey_3_k(%rsp), \TMP4
|
|
|
|
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
|
|
|
|
+ pxor \TMP1, \TMP6
|
|
|
|
+ pxor \XMM2, \XMMDst
|
|
|
|
+ pxor \TMP2, \XMM1
|
|
|
|
+# results accumulated in TMP6, XMMDst, XMM1
|
|
|
|
+
|
|
|
|
+ # Multiply TMP1 * HashKey (using Karatsuba)
|
|
|
|
+
|
|
|
|
+ movdqa \XMM3, \TMP1
|
|
|
|
+ pshufd $78, \XMM3, \TMP2
|
|
|
|
+ pxor \XMM3, \TMP2
|
|
|
|
+ movdqa HashKey_2(%rsp), \TMP5
|
|
|
|
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
|
|
|
|
+ PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
|
|
|
|
+ movdqa HashKey_2_k(%rsp), \TMP4
|
|
|
|
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
|
|
|
|
+ pxor \TMP1, \TMP6
|
|
|
|
+ pxor \XMM3, \XMMDst
|
|
|
|
+ pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
|
|
|
|
+
|
|
|
|
+ # Multiply TMP1 * HashKey (using Karatsuba)
|
|
|
|
+ movdqa \XMM4, \TMP1
|
|
|
|
+ pshufd $78, \XMM4, \TMP2
|
|
|
|
+ pxor \XMM4, \TMP2
|
|
|
|
+ movdqa HashKey(%rsp), \TMP5
|
|
|
|
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
|
|
|
|
+ PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
|
|
|
|
+ movdqa HashKey_k(%rsp), \TMP4
|
|
|
|
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
|
|
|
|
+ pxor \TMP1, \TMP6
|
|
|
|
+ pxor \XMM4, \XMMDst
|
|
|
|
+ pxor \XMM1, \TMP2
|
|
|
|
+ pxor \TMP6, \TMP2
|
|
|
|
+ pxor \XMMDst, \TMP2
|
|
|
|
+ # middle section of the temp results combined as in karatsuba algorithm
|
|
|
|
+ movdqa \TMP2, \TMP4
|
|
|
|
+ pslldq $8, \TMP4 # left shift TMP4 2 DWs
|
|
|
|
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
|
|
|
|
+ pxor \TMP4, \XMMDst
|
|
|
|
+ pxor \TMP2, \TMP6
|
|
|
|
+# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
|
|
|
|
+ # first phase of the reduction
|
|
|
|
+ movdqa \XMMDst, \TMP2
|
|
|
|
+ movdqa \XMMDst, \TMP3
|
|
|
|
+ movdqa \XMMDst, \TMP4
|
|
|
|
+# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
|
|
|
|
+ pslld $31, \TMP2 # packed right shifting << 31
|
|
|
|
+ pslld $30, \TMP3 # packed right shifting << 30
|
|
|
|
+ pslld $25, \TMP4 # packed right shifting << 25
|
|
|
|
+ pxor \TMP3, \TMP2 # xor the shifted versions
|
|
|
|
+ pxor \TMP4, \TMP2
|
|
|
|
+ movdqa \TMP2, \TMP7
|
|
|
|
+ psrldq $4, \TMP7 # right shift TMP7 1 DW
|
|
|
|
+ pslldq $12, \TMP2 # left shift TMP2 3 DWs
|
|
|
|
+ pxor \TMP2, \XMMDst
|
|
|
|
+
|
|
|
|
+ # second phase of the reduction
|
|
|
|
+ movdqa \XMMDst, \TMP2
|
|
|
|
+ # make 3 copies of XMMDst for doing 3 shift operations
|
|
|
|
+ movdqa \XMMDst, \TMP3
|
|
|
|
+ movdqa \XMMDst, \TMP4
|
|
|
|
+ psrld $1, \TMP2 # packed left shift >> 1
|
|
|
|
+ psrld $2, \TMP3 # packed left shift >> 2
|
|
|
|
+ psrld $7, \TMP4 # packed left shift >> 7
|
|
|
|
+ pxor \TMP3, \TMP2 # xor the shifted versions
|
|
|
|
+ pxor \TMP4, \TMP2
|
|
|
|
+ pxor \TMP7, \TMP2
|
|
|
|
+ pxor \TMP2, \XMMDst
|
|
|
|
+ pxor \TMP6, \XMMDst # reduced result is in XMMDst
|
|
|
|
+.endm
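
A useful cross-check for the carry-less-multiply path is the bitwise GF(2^128) multiplication from the GCM specification. Note the assembly keeps its operands bit-reflected, which is why its comments quote the polynomial as x^128 + x^127 + x^126 + x^121 + 1; the reference below works in the standard orientation with R = 0xe1 followed by zeroes (illustrative sketch, not part of the patch):

    #include <stdint.h>
    #include <string.h>

    /* Bitwise reference GHASH multiply over GF(2^128), big-endian 16-byte blocks. */
    static void ghash_mul_ref(uint8_t z[16], const uint8_t x[16], const uint8_t y[16])
    {
        uint8_t v[16], r[16] = {0};
        memcpy(v, y, 16);
        for (int i = 0; i < 128; i++) {
            if ((x[i / 8] >> (7 - i % 8)) & 1)          /* bit i of x, MSB first */
                for (int j = 0; j < 16; j++)
                    r[j] ^= v[j];
            int lsb = v[15] & 1;
            for (int j = 15; j > 0; j--)                /* v >>= 1 across the block */
                v[j] = (uint8_t)((v[j] >> 1) | (v[j - 1] << 7));
            v[0] >>= 1;
            if (lsb)
                v[0] ^= 0xe1;                           /* reduce by the GCM polynomial */
        }
        memcpy(z, r, 16);
    }
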
|
|
|
|
+
|
|
|
|
+/* Encryption of a single block done*/
|
|
|
|
+.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
|
|
|
|
+
|
|
|
|
+ pxor (%arg1), \XMM0
|
|
|
|
+ movaps 16(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, \XMM0
|
|
|
|
+ movaps 32(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, \XMM0
|
|
|
|
+ movaps 48(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, \XMM0
|
|
|
|
+ movaps 64(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, \XMM0
|
|
|
|
+ movaps 80(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, \XMM0
|
|
|
|
+ movaps 96(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, \XMM0
|
|
|
|
+ movaps 112(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, \XMM0
|
|
|
|
+ movaps 128(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, \XMM0
|
|
|
|
+ movaps 144(%arg1), \TMP1
|
|
|
|
+ AESENC \TMP1, \XMM0
|
|
|
|
+ movaps 160(%arg1), \TMP1
|
|
|
|
+ AESENCLAST \TMP1, \XMM0
|
|
|
|
+.endm
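
ENCRYPT_SINGLE_BLOCK is a straight 10-round AES-128 encryption against the pre-expanded key schedule at %arg1. The same operation with AES-NI intrinsics (illustrative sketch assuming an 11-entry round-key array, not code from the patch):

    #include <immintrin.h>                    /* build with -maes */

    /* AES-128: whitening XOR, nine AESENC rounds, one AESENCLAST. */
    static __m128i aes128_encrypt_block(const __m128i rk[11], __m128i block)
    {
        block = _mm_xor_si128(block, rk[0]);
        for (int i = 1; i < 10; i++)
            block = _mm_aesenc_si128(block, rk[i]);
        return _mm_aesenclast_si128(block, rk[10]);
    }
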
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+/*****************************************************************************
|
|
|
|
+* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
|
|
|
|
+* u8 *out, // Plaintext output. Encrypt in-place is allowed.
|
|
|
|
+* const u8 *in, // Ciphertext input
|
|
|
|
+* u64 plaintext_len, // Length of data in bytes for decryption.
|
|
|
|
+* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
|
|
|
|
+* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
|
|
|
|
+* // concatenated with 0x00000001. 16-byte aligned pointer.
|
|
|
|
+* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
|
|
|
|
+* const u8 *aad, // Additional Authentication Data (AAD)
|
|
|
|
+* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
|
|
|
|
+* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
|
|
|
|
+* // given authentication tag and only return the plaintext if they match.
|
|
|
|
+* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
|
|
|
|
+* // (most likely), 12 or 8.
|
|
|
|
+*
|
|
|
|
+* Assumptions:
|
|
|
|
+*
|
|
|
|
+* keys:
|
|
|
|
+* keys are pre-expanded and aligned to 16 bytes. we are using the first
|
|
|
|
+* set of 11 keys in the data structure void *aes_ctx
|
|
|
|
+*
|
|
|
|
+* iv:
|
|
|
|
+* 0 1 2 3
|
|
|
|
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | Salt (From the SA) |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | Initialization Vector |
|
|
|
|
+* | (This is the sequence number from IPSec header) |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | 0x1 |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+*
|
|
|
|
+*
|
|
|
|
+*
|
|
|
|
+* AAD:
|
|
|
|
+* AAD padded to 128 bits with 0
|
|
|
|
+* for example, assume AAD is a u32 vector
|
|
|
|
+*
|
|
|
|
+* if AAD is 8 bytes:
|
|
|
|
+* AAD[3] = {A0, A1};
|
|
|
|
+* padded AAD in xmm register = {A1 A0 0 0}
|
|
|
|
+*
|
|
|
|
+* 0 1 2 3
|
|
|
|
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | SPI (A1) |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | 32-bit Sequence Number (A0) |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | 0x0 |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+*
|
|
|
|
+* AAD Format with 32-bit Sequence Number
|
|
|
|
+*
|
|
|
|
+* if AAD is 12 bytes:
|
|
|
|
+* AAD[3] = {A0, A1, A2};
|
|
|
|
+* padded AAD in xmm register = {A2 A1 A0 0}
|
|
|
|
+*
|
|
|
|
+* 0 1 2 3
|
|
|
|
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
|
|
|
|
+* | SPI (A2) |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | 64-bit Extended Sequence Number {A1,A0} |
|
|
|
|
+* | |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | 0x0 |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+*
|
|
|
|
+* AAD Format with 64-bit Extended Sequence Number
|
|
|
|
+*
|
|
|
|
+* aadLen:
|
|
|
|
+* from the definition of the spec, aadLen can only be 8 or 12 bytes.
|
|
|
|
+* The code supports 16 too but for other sizes, the code will fail.
|
|
|
|
+*
|
|
|
|
+* TLen:
|
|
|
|
+* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
|
|
|
|
+* For other sizes, the code will fail.
|
|
|
|
+*
|
|
|
|
+* poly = x^128 + x^127 + x^126 + x^121 + 1
|
|
|
|
+*
|
|
|
|
+*****************************************************************************/
|
|
|
|
+
|
|
|
|
+ENTRY(aesni_gcm_dec)
|
|
|
|
+ push %r12
|
|
|
|
+ push %r13
|
|
|
|
+ push %r14
|
|
|
|
+ mov %rsp, %r14
|
|
|
|
+/*
|
|
|
|
+* states of %xmm registers %xmm6:%xmm15 not saved
|
|
|
|
+* all %xmm registers are clobbered
|
|
|
|
+*/
|
|
|
|
+ sub $VARIABLE_OFFSET, %rsp
|
|
|
|
+ and $~63, %rsp # align rsp to 64 bytes
|
|
|
|
+ mov %arg6, %r12
|
|
|
|
+ movdqu (%r12), %xmm13 # %xmm13 = HashKey
|
|
|
|
+ pshufb SHUF_MASK(%rip), %xmm13
|
|
|
|
+
|
|
|
|
+# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
|
|
|
|
+
|
|
|
|
+ movdqa %xmm13, %xmm2
|
|
|
|
+ psllq $1, %xmm13
|
|
|
|
+ psrlq $63, %xmm2
|
|
|
|
+ movdqa %xmm2, %xmm1
|
|
|
|
+ pslldq $8, %xmm2
|
|
|
|
+ psrldq $8, %xmm1
|
|
|
|
+ por %xmm2, %xmm13
|
|
|
|
+
|
|
|
|
+ # Reduction
|
|
|
|
+
|
|
|
|
+ pshufd $0x24, %xmm1, %xmm2
|
|
|
|
+ pcmpeqd TWOONE(%rip), %xmm2
|
|
|
|
+ pand POLY(%rip), %xmm2
|
|
|
|
+ pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ # Decrypt first few blocks
|
|
|
|
+
|
|
|
|
+ movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
|
|
|
|
+ mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
|
|
|
|
+ and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
|
|
|
|
+ mov %r13, %r12
|
|
|
|
+ and $(3<<4), %r12
|
|
|
|
+ jz _initial_num_blocks_is_0_decrypt
|
|
|
|
+ cmp $(2<<4), %r12
|
|
|
|
+ jb _initial_num_blocks_is_1_decrypt
|
|
|
|
+ je _initial_num_blocks_is_2_decrypt
|
|
|
|
+_initial_num_blocks_is_3_decrypt:
|
|
|
|
+ INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
|
|
|
|
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
|
|
|
|
+ sub $48, %r13
|
|
|
|
+ jmp _initial_blocks_decrypted
|
|
|
|
+_initial_num_blocks_is_2_decrypt:
|
|
|
|
+ INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
|
|
|
|
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
|
|
|
|
+ sub $32, %r13
|
|
|
|
+ jmp _initial_blocks_decrypted
|
|
|
|
+_initial_num_blocks_is_1_decrypt:
|
|
|
|
+ INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
|
|
|
|
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
|
|
|
|
+ sub $16, %r13
|
|
|
|
+ jmp _initial_blocks_decrypted
|
|
|
|
+_initial_num_blocks_is_0_decrypt:
|
|
|
|
+ INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
|
|
|
|
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
|
|
|
|
+_initial_blocks_decrypted:
|
|
|
|
+ cmp $0, %r13
|
|
|
|
+ je _zero_cipher_left_decrypt
|
|
|
|
+ sub $64, %r13
|
|
|
|
+ je _four_cipher_left_decrypt
|
|
|
|
+_decrypt_by_4:
|
|
|
|
+ GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
|
|
|
|
+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
|
|
|
|
+ add $64, %r11
|
|
|
|
+ sub $64, %r13
|
|
|
|
+ jne _decrypt_by_4
|
|
|
|
+_four_cipher_left_decrypt:
|
|
|
|
+ GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
|
|
|
|
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
|
|
|
|
+_zero_cipher_left_decrypt:
|
|
|
|
+ mov %arg4, %r13
|
|
|
|
+ and $15, %r13 # %r13 = arg4 (mod 16)
|
|
|
|
+ je _multiple_of_16_bytes_decrypt
|
|
|
|
+
|
|
|
|
+ # Handle the last <16 byte block separately
|
|
|
|
+
|
|
|
|
+ paddd ONE(%rip), %xmm0 # increment CNT to get Yn
|
|
|
|
+ pshufb SHUF_MASK(%rip), %xmm0
|
|
|
|
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
|
|
|
|
+ sub $16, %r11
|
|
|
|
+ add %r13, %r11
|
|
|
|
+ movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
|
|
|
|
+ lea SHIFT_MASK+16(%rip), %r12
|
|
|
|
+ sub %r13, %r12
|
|
|
|
+# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
|
|
|
|
+# (%r13 is the number of bytes in plaintext mod 16)
|
|
|
|
+ movdqu (%r12), %xmm2 # get the appropriate shuffle mask
|
|
|
|
+ pshufb %xmm2, %xmm1 # right shift 16-%r13 bytes
|
|
|
|
+ movdqa %xmm1, %xmm2
|
|
|
|
+ pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
|
|
|
|
+ movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
|
|
|
|
+ # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
|
|
|
|
+ pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
|
|
|
|
+ pand %xmm1, %xmm2
|
|
|
|
+ pshufb SHUF_MASK(%rip),%xmm2
|
|
|
|
+ pxor %xmm2, %xmm8
|
|
|
|
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
|
|
|
|
+ # GHASH computation for the last <16 byte block
|
|
|
|
+ sub %r13, %r11
|
|
|
|
+ add $16, %r11
|
|
|
|
+
|
|
|
|
+ # output %r13 bytes
|
|
|
|
+ movq %xmm0, %rax
|
|
|
|
+ cmp $8, %r13
|
|
|
|
+ jle _less_than_8_bytes_left_decrypt
|
|
|
|
+ mov %rax, (%arg2 , %r11, 1)
|
|
|
|
+ add $8, %r11
|
|
|
|
+ psrldq $8, %xmm0
|
|
|
|
+ movq %xmm0, %rax
|
|
|
|
+ sub $8, %r13
|
|
|
|
+_less_than_8_bytes_left_decrypt:
|
|
|
|
+ mov %al, (%arg2, %r11, 1)
|
|
|
|
+ add $1, %r11
|
|
|
|
+ shr $8, %rax
|
|
|
|
+ sub $1, %r13
|
|
|
|
+ jne _less_than_8_bytes_left_decrypt
|
|
|
|
+_multiple_of_16_bytes_decrypt:
|
|
|
|
+ mov arg8, %r12 # %r12 = aadLen (number of bytes)
|
|
|
|
+ shl $3, %r12 # convert into number of bits
|
|
|
|
+ movd %r12d, %xmm15 # len(A) in %xmm15
|
|
|
|
+ shl $3, %arg4 # len(C) in bits (*8)
|
|
|
|
+ movq %arg4, %xmm1
|
|
|
|
+ pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
|
|
|
|
+ pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
|
|
|
|
+ pxor %xmm15, %xmm8
|
|
|
|
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
|
|
|
|
+ # final GHASH computation
|
|
|
|
+ pshufb SHUF_MASK(%rip), %xmm8
|
|
|
|
+ mov %arg5, %rax # %rax = *Y0
|
|
|
|
+ movdqu (%rax), %xmm0 # %xmm0 = Y0
|
|
|
|
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
|
|
|
|
+ pxor %xmm8, %xmm0
|
|
|
|
+_return_T_decrypt:
|
|
|
|
+ mov arg9, %r10 # %r10 = authTag
|
|
|
|
+ mov arg10, %r11 # %r11 = auth_tag_len
|
|
|
|
+ cmp $16, %r11
|
|
|
|
+ je _T_16_decrypt
|
|
|
|
+ cmp $12, %r11
|
|
|
|
+ je _T_12_decrypt
|
|
|
|
+_T_8_decrypt:
|
|
|
|
+ movq %xmm0, %rax
|
|
|
|
+ mov %rax, (%r10)
|
|
|
|
+ jmp _return_T_done_decrypt
|
|
|
|
+_T_12_decrypt:
|
|
|
|
+ movq %xmm0, %rax
|
|
|
|
+ mov %rax, (%r10)
|
|
|
|
+ psrldq $8, %xmm0
|
|
|
|
+ movd %xmm0, %eax
|
|
|
|
+ mov %eax, 8(%r10)
|
|
|
|
+ jmp _return_T_done_decrypt
|
|
|
|
+_T_16_decrypt:
|
|
|
|
+ movdqu %xmm0, (%r10)
|
|
|
|
+_return_T_done_decrypt:
|
|
|
|
+ mov %r14, %rsp
|
|
|
|
+ pop %r14
|
|
|
|
+ pop %r13
|
|
|
|
+ pop %r12
|
|
|
|
+ ret
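
For orientation, here is a caller-side view of the decrypt entry point. The prototype is the one documented in the comment block above; the wrapper around it (j0 construction and tag comparison) is only an illustrative sketch of how a driver might use it, with hypothetical names:

    #include <string.h>

    typedef unsigned char u8;
    typedef unsigned long long u64;

    void aesni_gcm_dec(void *aes_ctx, u8 *out, const u8 *in, u64 plaintext_len,
                       u8 *iv, u8 *hash_subkey, const u8 *aad, u64 aad_len,
                       u8 *auth_tag, u64 auth_tag_len);

    /* Hypothetical wrapper: returns 0 on tag match, -1 otherwise. */
    static int rfc4106_decrypt_one(void *aes_ctx, u8 *hash_subkey,
                                   const u8 salt[4], const u8 esp_iv[8],
                                   const u8 *aad, u64 aad_len,
                                   const u8 *ctext, u64 clen,
                                   const u8 *expected_tag, u64 tag_len, u8 *ptext)
    {
        u8 iv[16] __attribute__((aligned(16)));    /* 16-byte aligned j0 */
        u8 tag[16];

        memcpy(iv, salt, 4);                       /* 4-byte salt from the SA        */
        memcpy(iv + 4, esp_iv, 8);                 /* 8-byte IV from the ESP payload */
        iv[12] = 0; iv[13] = 0; iv[14] = 0; iv[15] = 1;   /* trailing 0x00000001     */

        aesni_gcm_dec(aes_ctx, ptext, ctext, clen, iv, hash_subkey,
                      aad, aad_len, tag, tag_len);
        return memcmp(tag, expected_tag, tag_len) ? -1 : 0;
    }
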
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+/*****************************************************************************
|
|
|
|
+* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
|
|
|
|
+* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
|
|
|
|
+* const u8 *in, // Plaintext input
|
|
|
|
+* u64 plaintext_len, // Length of data in bytes for encryption.
|
|
|
|
+* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
|
|
|
|
+* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
|
|
|
|
+* // concatenated with 0x00000001. 16-byte aligned pointer.
|
|
|
|
+* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
|
|
|
|
+* const u8 *aad, // Additional Authentication Data (AAD)
|
|
|
|
+* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
|
|
|
|
+* u8 *auth_tag, // Authenticated Tag output.
|
|
|
|
+* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
|
|
|
|
+* // 12 or 8.
|
|
|
|
+*
|
|
|
|
+* Assumptions:
|
|
|
|
+*
|
|
|
|
+* keys:
|
|
|
|
+* keys are pre-expanded and aligned to 16 bytes. we are using the
|
|
|
|
+* first set of 11 keys in the data structure void *aes_ctx
|
|
|
|
+*
|
|
|
|
+*
|
|
|
|
+* iv:
|
|
|
|
+* 0 1 2 3
|
|
|
|
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | Salt (From the SA) |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | Initialization Vector |
|
|
|
|
+* | (This is the sequence number from IPSec header) |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | 0x1 |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+*
|
|
|
|
+*
|
|
|
|
+*
|
|
|
|
+* AAD:
|
|
|
|
+* AAD padded to 128 bits with 0
|
|
|
|
+* for example, assume AAD is a u32 vector
|
|
|
|
+*
|
|
|
|
+* if AAD is 8 bytes:
|
|
|
|
+* AAD[3] = {A0, A1};
|
|
|
|
+* padded AAD in xmm register = {A1 A0 0 0}
|
|
|
|
+*
|
|
|
|
+* 0 1 2 3
|
|
|
|
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | SPI (A1) |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | 32-bit Sequence Number (A0) |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | 0x0 |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+*
|
|
|
|
+* AAD Format with 32-bit Sequence Number
|
|
|
|
+*
|
|
|
|
+* if AAD is 12 bytes:
|
|
|
|
+* AAD[3] = {A0, A1, A2};
|
|
|
|
+* padded AAD in xmm register = {A2 A1 A0 0}
|
|
|
|
+*
|
|
|
|
+* 0 1 2 3
|
|
|
|
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | SPI (A2) |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | 64-bit Extended Sequence Number {A1,A0} |
|
|
|
|
+* | |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+* | 0x0 |
|
|
|
|
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
|
|
|
|
+*
|
|
|
|
+* AAD Format with 64-bit Extended Sequence Number
|
|
|
|
+*
|
|
|
|
+* aadLen:
|
|
|
|
+* from the definition of the spec, aadLen can only be 8 or 12 bytes.
|
|
|
|
+* The code supports 16 too but for other sizes, the code will fail.
|
|
|
|
+*
|
|
|
|
+* TLen:
|
|
|
|
+* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
|
|
|
|
+* For other sizes, the code will fail.
|
|
|
|
+*
|
|
|
|
+* poly = x^128 + x^127 + x^126 + x^121 + 1
|
|
|
|
+***************************************************************************/
|
|
|
|
+ENTRY(aesni_gcm_enc)
|
|
|
|
+ push %r12
|
|
|
|
+ push %r13
|
|
|
|
+ push %r14
|
|
|
|
+ mov %rsp, %r14
|
|
|
|
+#
|
|
|
|
+# states of %xmm registers %xmm6:%xmm15 not saved
|
|
|
|
+# all %xmm registers are clobbered
|
|
|
|
+#
|
|
|
|
+ sub $VARIABLE_OFFSET, %rsp
|
|
|
|
+ and $~63, %rsp
|
|
|
|
+ mov %arg6, %r12
|
|
|
|
+ movdqu (%r12), %xmm13
|
|
|
|
+ pshufb SHUF_MASK(%rip), %xmm13
|
|
|
|
+
|
|
|
|
+# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
|
|
|
|
+
|
|
|
|
+ movdqa %xmm13, %xmm2
|
|
|
|
+ psllq $1, %xmm13
|
|
|
|
+ psrlq $63, %xmm2
|
|
|
|
+ movdqa %xmm2, %xmm1
|
|
|
|
+ pslldq $8, %xmm2
|
|
|
|
+ psrldq $8, %xmm1
|
|
|
|
+ por %xmm2, %xmm13
|
|
|
|
+
|
|
|
|
+ # reduce HashKey<<1
|
|
|
|
+
|
|
|
|
+ pshufd $0x24, %xmm1, %xmm2
|
|
|
|
+ pcmpeqd TWOONE(%rip), %xmm2
|
|
|
|
+ pand POLY(%rip), %xmm2
|
|
|
|
+ pxor %xmm2, %xmm13
|
|
|
|
+ movdqa %xmm13, HashKey(%rsp)
|
|
|
|
+ mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly)
|
|
|
|
+ and $-16, %r13
|
|
|
|
+ mov %r13, %r12
|
|
|
|
+
|
|
|
|
+ # Encrypt first few blocks
|
|
|
|
+
|
|
|
|
+ and $(3<<4), %r12
|
|
|
|
+ jz _initial_num_blocks_is_0_encrypt
|
|
|
|
+ cmp $(2<<4), %r12
|
|
|
|
+ jb _initial_num_blocks_is_1_encrypt
|
|
|
|
+ je _initial_num_blocks_is_2_encrypt
|
|
|
|
+_initial_num_blocks_is_3_encrypt:
|
|
|
|
+ INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
|
|
|
|
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
|
|
|
|
+ sub $48, %r13
|
|
|
|
+ jmp _initial_blocks_encrypted
|
|
|
|
+_initial_num_blocks_is_2_encrypt:
|
|
|
|
+ INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
|
|
|
|
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
|
|
|
|
+ sub $32, %r13
|
|
|
|
+ jmp _initial_blocks_encrypted
|
|
|
|
+_initial_num_blocks_is_1_encrypt:
|
|
|
|
+ INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
|
|
|
|
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
|
|
|
|
+ sub $16, %r13
|
|
|
|
+ jmp _initial_blocks_encrypted
|
|
|
|
+_initial_num_blocks_is_0_encrypt:
|
|
|
|
+ INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
|
|
|
|
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
|
|
|
|
+_initial_blocks_encrypted:
|
|
|
|
+
|
|
|
|
+ # Main loop - Encrypt remaining blocks
|
|
|
|
+
|
|
|
|
+ cmp $0, %r13
|
|
|
|
+ je _zero_cipher_left_encrypt
|
|
|
|
+ sub $64, %r13
|
|
|
|
+ je _four_cipher_left_encrypt
|
|
|
|
+_encrypt_by_4_encrypt:
|
|
|
|
+ GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
|
|
|
|
+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
|
|
|
|
+ add $64, %r11
|
|
|
|
+ sub $64, %r13
|
|
|
|
+ jne _encrypt_by_4_encrypt
|
|
|
|
+_four_cipher_left_encrypt:
|
|
|
|
+ GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
|
|
|
|
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
|
|
|
|
+_zero_cipher_left_encrypt:
|
|
|
|
+ mov %arg4, %r13
|
|
|
|
+ and $15, %r13 # %r13 = arg4 (mod 16)
|
|
|
|
+ je _multiple_of_16_bytes_encrypt
|
|
|
|
+
|
|
|
|
+ # Handle the last <16 Byte block separately
|
|
|
|
+ paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
|
|
|
|
+ pshufb SHUF_MASK(%rip), %xmm0
|
|
|
|
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
|
|
|
|
+ sub $16, %r11
|
|
|
|
+ add %r13, %r11
|
|
|
|
+ movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
|
|
|
|
+ lea SHIFT_MASK+16(%rip), %r12
|
|
|
|
+ sub %r13, %r12
|
|
|
|
+ # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
|
|
|
|
+ # (%r13 is the number of bytes in plaintext mod 16)
|
|
|
|
+ movdqu (%r12), %xmm2 # get the appropriate shuffle mask
|
|
|
|
+ pshufb %xmm2, %xmm1 # shift right 16-r13 byte
|
|
|
|
+ pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
|
|
|
|
+ movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
|
|
|
|
+ # get the appropriate mask to mask out top 16-r13 bytes of xmm0
|
|
|
|
+ pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
|
|
|
|
+
|
|
|
|
+ pshufb SHUF_MASK(%rip),%xmm0
|
|
|
|
+ pxor %xmm0, %xmm8
|
|
|
|
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
|
|
|
|
+ # GHASH computation for the last <16 byte block
|
|
|
|
+ sub %r13, %r11
|
|
|
|
+ add $16, %r11
|
|
|
|
+ pshufb SHUF_MASK(%rip), %xmm0
|
|
|
|
+ # shuffle xmm0 back to output as ciphertext
|
|
|
|
+
|
|
|
|
+ # Output %r13 bytes
|
|
|
|
+ movq %xmm0, %rax
|
|
|
|
+ cmp $8, %r13
|
|
|
|
+ jle _less_than_8_bytes_left_encrypt
|
|
|
|
+ mov %rax, (%arg2 , %r11, 1)
|
|
|
|
+ add $8, %r11
|
|
|
|
+ psrldq $8, %xmm0
|
|
|
|
+ movq %xmm0, %rax
|
|
|
|
+ sub $8, %r13
|
|
|
|
+_less_than_8_bytes_left_encrypt:
|
|
|
|
+ mov %al, (%arg2, %r11, 1)
|
|
|
|
+ add $1, %r11
|
|
|
|
+ shr $8, %rax
|
|
|
|
+ sub $1, %r13
|
|
|
|
+ jne _less_than_8_bytes_left_encrypt
|
|
|
|
+_multiple_of_16_bytes_encrypt:
|
|
|
|
+ mov arg8, %r12 # %r12 = aadLen (number of bytes)
|
|
|
|
+ shl $3, %r12
|
|
|
|
+ movd %r12d, %xmm15 # len(A) in %xmm15
|
|
|
|
+ shl $3, %arg4 # len(C) in bits (*8)
|
|
|
|
+ movq %arg4, %xmm1
|
|
|
|
+ pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
|
|
|
|
+ pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
|
|
|
|
+ pxor %xmm15, %xmm8
|
|
|
|
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
|
|
|
|
+ # final GHASH computation
|
|
|
|
+
|
|
|
|
+ pshufb SHUF_MASK(%rip), %xmm8 # perform a 16 byte swap
|
|
|
|
+ mov %arg5, %rax # %rax = *Y0
|
|
|
|
+ movdqu (%rax), %xmm0 # %xmm0 = Y0
|
|
|
|
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
|
|
|
|
+ pxor %xmm8, %xmm0
|
|
|
|
+_return_T_encrypt:
|
|
|
|
+ mov arg9, %r10 # %r10 = authTag
|
|
|
|
+ mov arg10, %r11 # %r11 = auth_tag_len
|
|
|
|
+ cmp $16, %r11
|
|
|
|
+ je _T_16_encrypt
|
|
|
|
+ cmp $12, %r11
|
|
|
|
+ je _T_12_encrypt
|
|
|
|
+_T_8_encrypt:
|
|
|
|
+ movq %xmm0, %rax
|
|
|
|
+ mov %rax, (%r10)
|
|
|
|
+ jmp _return_T_done_encrypt
|
|
|
|
+_T_12_encrypt:
|
|
|
|
+ movq %xmm0, %rax
|
|
|
|
+ mov %rax, (%r10)
|
|
|
|
+ psrldq $8, %xmm0
|
|
|
|
+ movd %xmm0, %eax
|
|
|
|
+ mov %eax, 8(%r10)
|
|
|
|
+ jmp _return_T_done_encrypt
|
|
|
|
+_T_16_encrypt:
|
|
|
|
+ movdqu %xmm0, (%r10)
|
|
|
|
+_return_T_done_encrypt:
|
|
|
|
+ mov %r14, %rsp
|
|
|
|
+ pop %r14
|
|
|
|
+ pop %r13
|
|
|
|
+ pop %r12
|
|
|
|
+ ret
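
Both routines finish GHASH with the block len(A) || len(C), each length a 64-bit count of bits (the assembly XORs the two little-endian qwords straight into its byte-reflected GHASH state, which amounts to the same thing). The standard construction in C (illustrative sketch):

    #include <stdint.h>

    /* Big-endian 64-bit bit-lengths of the AAD and the ciphertext, concatenated. */
    static void build_length_block(uint8_t block[16], uint64_t aad_len, uint64_t text_len)
    {
        uint64_t abits = aad_len * 8, cbits = text_len * 8;
        for (int i = 0; i < 8; i++) {
            block[7 - i]  = (uint8_t)(abits >> (8 * i));
            block[15 - i] = (uint8_t)(cbits >> (8 * i));
        }
    }
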
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
_key_expansion_128:
|
|
_key_expansion_128:
|
|
_key_expansion_256a:
|
|
_key_expansion_256a:
|
|
pshufd $0b11111111, %xmm1, %xmm1
|
|
pshufd $0b11111111, %xmm1, %xmm1
|