aesni-intel_asm.S

  1. /*
  2. * Implement AES algorithm in Intel AES-NI instructions.
  3. *
  4. * The white paper of AES-NI instructions can be downloaded from:
  5. * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
  6. *
  7. * Copyright (C) 2008, Intel Corp.
  8. * Author: Huang Ying <ying.huang@intel.com>
  9. * Vinodh Gopal <vinodh.gopal@intel.com>
  10. * Kahraman Akdemir
  11. *
  12. * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  13. * interface for 64-bit kernels.
  14. * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  15. * Aidan O'Mahony (aidan.o.mahony@intel.com)
  16. * Adrian Hoban <adrian.hoban@intel.com>
  17. * James Guilford (james.guilford@intel.com)
  18. * Gabriele Paoloni <gabriele.paoloni@intel.com>
  19. * Tadeusz Struk (tadeusz.struk@intel.com)
  20. * Wajdi Feghali (wajdi.k.feghali@intel.com)
  21. * Copyright (c) 2010, Intel Corporation.
  22. *
  23. * This program is free software; you can redistribute it and/or modify
  24. * it under the terms of the GNU General Public License as published by
  25. * the Free Software Foundation; either version 2 of the License, or
  26. * (at your option) any later version.
  27. */
  28. #include <linux/linkage.h>
  29. #include <asm/inst.h>
  30. .data
  31. POLY: .octa 0xC2000000000000000000000000000001
  32. TWOONE: .octa 0x00000001000000000000000000000001
  33. # order of these constants should not change.
  34. # more specifically, ALL_F should follow SHIFT_MASK,
  35. # and ZERO should follow ALL_F
  36. SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
  37. MASK1: .octa 0x0000000000000000ffffffffffffffff
  38. MASK2: .octa 0xffffffffffffffff0000000000000000
  39. SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  40. ALL_F: .octa 0xffffffffffffffffffffffffffffffff
  41. ZERO: .octa 0x00000000000000000000000000000000
  42. ONE: .octa 0x00000000000000000000000000000001
  43. F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  44. dec: .octa 0x1
  45. enc: .octa 0x2
  46. .text
  47. #define STACK_OFFSET 8*3
  48. #define HashKey 16*0 // store HashKey <<1 mod poly here
  49. #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
  50. #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
  51. #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
  52. #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
  53. // bits of HashKey <<1 mod poly here
  54. //(for Karatsuba purposes)
  55. #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
  56. // bits of HashKey^2 <<1 mod poly here
  57. // (for Karatsuba purposes)
  58. #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
  59. // bits of HashKey^3 <<1 mod poly here
  60. // (for Karatsuba purposes)
  61. #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
  62. // bits of HashKey^4 <<1 mod poly here
  63. // (for Karatsuba purposes)
  64. #define VARIABLE_OFFSET 16*8
  65. #define arg1 rdi
  66. #define arg2 rsi
  67. #define arg3 rdx
  68. #define arg4 rcx
  69. #define arg5 r8
  70. #define arg6 r9
  71. #define arg7 STACK_OFFSET+8(%r14)
  72. #define arg8 STACK_OFFSET+16(%r14)
  73. #define arg9 STACK_OFFSET+24(%r14)
  74. #define arg10 STACK_OFFSET+32(%r14)
  75. #define STATE1 %xmm0
  76. #define STATE2 %xmm4
  77. #define STATE3 %xmm5
  78. #define STATE4 %xmm6
  79. #define STATE STATE1
  80. #define IN1 %xmm1
  81. #define IN2 %xmm7
  82. #define IN3 %xmm8
  83. #define IN4 %xmm9
  84. #define IN IN1
  85. #define KEY %xmm2
  86. #define IV %xmm3
  87. #define BSWAP_MASK %xmm10
  88. #define CTR %xmm11
  89. #define INC %xmm12
  90. #define KEYP %rdi
  91. #define OUTP %rsi
  92. #define INP %rdx
  93. #define LEN %rcx
  94. #define IVP %r8
  95. #define KLEN %r9d
  96. #define T1 %r10
  97. #define TKEYP T1
  98. #define T2 %r11
  99. #define TCTR_LOW T2
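/*
 * The HashKey_* offsets above index a per-call scratch area of
 * VARIABLE_OFFSET (16*8 = 128) bytes that aesni_gcm_enc/dec carve out below
 * %rsp.  As a reading aid only, that layout corresponds to a C struct along
 * these lines (the struct and field names are illustrative, not kernel code):
 *
 *	#include <stdint.h>
 *
 *	struct gcm_scratch {               // one 16-byte slot per #define above
 *		uint8_t hash_key[16];      // HashKey<<1 mod poly
 *		uint8_t hash_key_2[16];    // HashKey^2<<1 mod poly
 *		uint8_t hash_key_3[16];    // HashKey^3<<1 mod poly
 *		uint8_t hash_key_4[16];    // HashKey^4<<1 mod poly
 *		uint8_t hash_key_k[16];    // high 64 bits XOR low 64 bits of HashKey
 *		uint8_t hash_key_2_k[16];  // same for HashKey^2 (Karatsuba)
 *		uint8_t hash_key_3_k[16];  // same for HashKey^3
 *		uint8_t hash_key_4_k[16];  // same for HashKey^4
 *	};                                 // sizeof == VARIABLE_OFFSET
 */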
  100. /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  101. *
  102. *
  103. * Input: A and B (128-bits each, bit-reflected)
  104. * Output: C = A*B*x mod poly, (i.e. >>1 )
  105. * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  106. * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  107. *
  108. */
  109. .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
  110. movdqa \GH, \TMP1
  111. pshufd $78, \GH, \TMP2
  112. pshufd $78, \HK, \TMP3
  113. pxor \GH, \TMP2 # TMP2 = a1+a0
  114. pxor \HK, \TMP3 # TMP3 = b1+b0
  115. PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
  116. PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
  117. PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
  118. pxor \GH, \TMP2
  119. pxor \TMP1, \TMP2 # TMP2 = a1*b0 + a0*b1 (middle Karatsuba term)
  120. movdqa \TMP2, \TMP3
  121. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  122. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  123. pxor \TMP3, \GH
  124. pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
  125. # first phase of the reduction
  126. movdqa \GH, \TMP2
  127. movdqa \GH, \TMP3
  128. movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
  129. # in order to perform
  130. # independent shifts
  131. pslld $31, \TMP2 # packed left shift <<31
  132. pslld $30, \TMP3 # packed left shift <<30
  133. pslld $25, \TMP4 # packed left shift <<25
  134. pxor \TMP3, \TMP2 # xor the shifted versions
  135. pxor \TMP4, \TMP2
  136. movdqa \TMP2, \TMP5
  137. psrldq $4, \TMP5 # right shift TMP5 1 DW
  138. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  139. pxor \TMP2, \GH
  140. # second phase of the reduction
  141. movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
  142. # in order to perform
  143. # independent shifts
  144. movdqa \GH,\TMP3
  145. movdqa \GH,\TMP4
  146. psrld $1,\TMP2 # packed right shift >>1
  147. psrld $2,\TMP3 # packed right shift >>2
  148. psrld $7,\TMP4 # packed right shift >>7
  149. pxor \TMP3,\TMP2 # xor the shifted versions
  150. pxor \TMP4,\TMP2
  151. pxor \TMP5, \TMP2
  152. pxor \TMP2, \GH
  153. pxor \TMP1, \GH # result is in GH
  154. .endm
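/*
 * GHASH_MUL above is the PCLMULQDQ/Karatsuba form of the GF(2^128) multiply.
 * For reference, the same product can be computed with the bit-serial
 * algorithm from the GCM specification.  A minimal C sketch (hypothetical
 * names, operating directly on big-endian 128-bit blocks, i.e. without the
 * byte swapping and the HashKey<<1 pre-shift used in this file):
 *
 *	#include <stdint.h>
 *
 *	typedef struct { uint64_t hi, lo; } be128;	// hi = bits 127..64
 *
 *	static be128 gf128_mul(be128 x, be128 y)
 *	{
 *		be128 z = { 0, 0 };
 *		be128 v = y;
 *
 *		for (int i = 0; i < 128; i++) {
 *			// bit i of x, counted from the most significant bit
 *			uint64_t xi = (i < 64) ? (x.hi >> (63 - i)) & 1
 *					       : (x.lo >> (127 - i)) & 1;
 *			if (xi) {
 *				z.hi ^= v.hi;
 *				z.lo ^= v.lo;
 *			}
 *			// v = v*x mod x^128 + x^127 + x^126 + x^121 + 1
 *			uint64_t lsb = v.lo & 1;
 *			v.lo = (v.lo >> 1) | (v.hi << 63);
 *			v.hi >>= 1;
 *			if (lsb)
 *				v.hi ^= 0xe100000000000000ULL;
 *		}
 *		return z;
 *	}
 */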
  155. /*
  156. * if a = number of total plaintext bytes
  157. * b = floor(a/16)
  158. * num_initial_blocks = b mod 4
  159. * encrypt the initial num_initial_blocks blocks and apply ghash on
  160. * the ciphertext
  161. * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  162. * are clobbered
  163. * arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
  164. */
  165. .macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
  166. XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
  167. mov arg7, %r10 # %r10 = AAD
  168. mov arg8, %r12 # %r12 = aadLen
  169. mov %r12, %r11
  170. pxor %xmm\i, %xmm\i
  171. _get_AAD_loop\num_initial_blocks\operation:
  172. movd (%r10), \TMP1
  173. pslldq $12, \TMP1
  174. psrldq $4, %xmm\i
  175. pxor \TMP1, %xmm\i
  176. add $4, %r10
  177. sub $4, %r12
  178. jne _get_AAD_loop\num_initial_blocks\operation
  179. cmp $16, %r11
  180. je _get_AAD_loop2_done\num_initial_blocks\operation
  181. mov $16, %r12
  182. _get_AAD_loop2\num_initial_blocks\operation:
  183. psrldq $4, %xmm\i
  184. sub $4, %r12
  185. cmp %r11, %r12
  186. jne _get_AAD_loop2\num_initial_blocks\operation
  187. _get_AAD_loop2_done\num_initial_blocks\operation:
  188. pshufb SHUF_MASK(%rip), %xmm\i # byte-reflect the AAD data
  189. xor %r11, %r11 # initialise the data pointer offset as zero
  190. # start AES for num_initial_blocks blocks
  191. mov %arg5, %rax # %rax = *Y0
  192. movdqu (%rax), \XMM0 # XMM0 = Y0
  193. pshufb SHUF_MASK(%rip), \XMM0
  194. .if \i_seq != 0
  195. .irpc index, \i_seq
  196. paddd ONE(%rip), \XMM0 # INCR Y0
  197. movdqa \XMM0, %xmm\index
  198. pshufb SHUF_MASK(%rip), %xmm\index # perform a 16 byte swap
  199. .endr
  200. .irpc index, \i_seq
  201. pxor 16*0(%arg1), %xmm\index
  202. .endr
  203. .irpc index, \i_seq
  204. movaps 0x10(%rdi), \TMP1
  205. AESENC \TMP1, %xmm\index # Round 1
  206. .endr
  207. .irpc index, \i_seq
  208. movaps 0x20(%arg1), \TMP1
  209. AESENC \TMP1, %xmm\index # Round 2
  210. .endr
  211. .irpc index, \i_seq
  212. movaps 0x30(%arg1), \TMP1
  213. AESENC \TMP1, %xmm\index # Round 3
  214. .endr
  215. .irpc index, \i_seq
  216. movaps 0x40(%arg1), \TMP1
  217. AESENC \TMP1, %xmm\index # Round 4
  218. .endr
  219. .irpc index, \i_seq
  220. movaps 0x50(%arg1), \TMP1
  221. AESENC \TMP1, %xmm\index # Round 5
  222. .endr
  223. .irpc index, \i_seq
  224. movaps 0x60(%arg1), \TMP1
  225. AESENC \TMP1, %xmm\index # Round 6
  226. .endr
  227. .irpc index, \i_seq
  228. movaps 0x70(%arg1), \TMP1
  229. AESENC \TMP1, %xmm\index # Round 7
  230. .endr
  231. .irpc index, \i_seq
  232. movaps 0x80(%arg1), \TMP1
  233. AESENC \TMP1, %xmm\index # Round 8
  234. .endr
  235. .irpc index, \i_seq
  236. movaps 0x90(%arg1), \TMP1
  237. AESENC \TMP1, %xmm\index # Round 9
  238. .endr
  239. .irpc index, \i_seq
  240. movaps 0xa0(%arg1), \TMP1
  241. AESENCLAST \TMP1, %xmm\index # Round 10
  242. .endr
  243. .irpc index, \i_seq
  244. movdqu (%arg3 , %r11, 1), \TMP1
  245. pxor \TMP1, %xmm\index
  246. movdqu %xmm\index, (%arg2 , %r11, 1)
  247. # write back plaintext/ciphertext for num_initial_blocks
  248. add $16, %r11
  249. .if \operation == dec
  250. movdqa \TMP1, %xmm\index
  251. .endif
  252. pshufb SHUF_MASK(%rip), %xmm\index
  253. # prepare plaintext/ciphertext for GHASH computation
  254. .endr
  255. .endif
  256. GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  257. # apply GHASH on num_initial_blocks blocks
  258. .if \i == 5
  259. pxor %xmm5, %xmm6
  260. GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  261. pxor %xmm6, %xmm7
  262. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  263. pxor %xmm7, %xmm8
  264. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  265. .elseif \i == 6
  266. pxor %xmm6, %xmm7
  267. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  268. pxor %xmm7, %xmm8
  269. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  270. .elseif \i == 7
  271. pxor %xmm7, %xmm8
  272. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  273. .endif
  274. cmp $64, %r13
  275. jl _initial_blocks_done\num_initial_blocks\operation
  276. # no need for precomputed values
  277. /*
  278. *
  279. * Precomputations for HashKey parallel with encryption of first 4 blocks.
  280. * HashKey_i_k holds XORed values of the low and high parts of HashKey_i
  281. */
  282. paddd ONE(%rip), \XMM0 # INCR Y0
  283. movdqa \XMM0, \XMM1
  284. pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
  285. paddd ONE(%rip), \XMM0 # INCR Y0
  286. movdqa \XMM0, \XMM2
  287. pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
  288. paddd ONE(%rip), \XMM0 # INCR Y0
  289. movdqa \XMM0, \XMM3
  290. pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
  291. paddd ONE(%rip), \XMM0 # INCR Y0
  292. movdqa \XMM0, \XMM4
  293. pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
  294. pxor 16*0(%arg1), \XMM1
  295. pxor 16*0(%arg1), \XMM2
  296. pxor 16*0(%arg1), \XMM3
  297. pxor 16*0(%arg1), \XMM4
  298. movdqa \TMP3, \TMP5
  299. pshufd $78, \TMP3, \TMP1
  300. pxor \TMP3, \TMP1
  301. movdqa \TMP1, HashKey_k(%rsp)
  302. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  303. # TMP5 = HashKey^2<<1 (mod poly)
  304. movdqa \TMP5, HashKey_2(%rsp)
  305. # HashKey_2 = HashKey^2<<1 (mod poly)
  306. pshufd $78, \TMP5, \TMP1
  307. pxor \TMP5, \TMP1
  308. movdqa \TMP1, HashKey_2_k(%rsp)
  309. .irpc index, 1234 # do 4 rounds
  310. movaps 0x10*\index(%arg1), \TMP1
  311. AESENC \TMP1, \XMM1
  312. AESENC \TMP1, \XMM2
  313. AESENC \TMP1, \XMM3
  314. AESENC \TMP1, \XMM4
  315. .endr
  316. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  317. # TMP5 = HashKey^3<<1 (mod poly)
  318. movdqa \TMP5, HashKey_3(%rsp)
  319. pshufd $78, \TMP5, \TMP1
  320. pxor \TMP5, \TMP1
  321. movdqa \TMP1, HashKey_3_k(%rsp)
  322. .irpc index, 56789 # do next 5 rounds
  323. movaps 0x10*\index(%arg1), \TMP1
  324. AESENC \TMP1, \XMM1
  325. AESENC \TMP1, \XMM2
  326. AESENC \TMP1, \XMM3
  327. AESENC \TMP1, \XMM4
  328. .endr
  329. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  330. # TMP5 = HashKey^4<<1 (mod poly)
  331. movdqa \TMP5, HashKey_4(%rsp)
  332. pshufd $78, \TMP5, \TMP1
  333. pxor \TMP5, \TMP1
  334. movdqa \TMP1, HashKey_4_k(%rsp)
  335. movaps 0xa0(%arg1), \TMP2
  336. AESENCLAST \TMP2, \XMM1
  337. AESENCLAST \TMP2, \XMM2
  338. AESENCLAST \TMP2, \XMM3
  339. AESENCLAST \TMP2, \XMM4
  340. movdqu 16*0(%arg3 , %r11 , 1), \TMP1
  341. pxor \TMP1, \XMM1
  342. .if \operation == dec
  343. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  344. movdqa \TMP1, \XMM1
  345. .endif
  346. movdqu 16*1(%arg3 , %r11 , 1), \TMP1
  347. pxor \TMP1, \XMM2
  348. .if \operation == dec
  349. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  350. movdqa \TMP1, \XMM2
  351. .endif
  352. movdqu 16*2(%arg3 , %r11 , 1), \TMP1
  353. pxor \TMP1, \XMM3
  354. .if \operation == dec
  355. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  356. movdqa \TMP1, \XMM3
  357. .endif
  358. movdqu 16*3(%arg3 , %r11 , 1), \TMP1
  359. pxor \TMP1, \XMM4
  360. .if \operation == dec
  361. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  362. movdqa \TMP1, \XMM4
  363. .else
  364. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  365. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  366. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  367. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  368. .endif
  369. add $64, %r11
  370. pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
  371. pxor \XMMDst, \XMM1
  372. # combine GHASHed value with the corresponding ciphertext
  373. pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
  374. pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
  375. pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
  376. _initial_blocks_done\num_initial_blocks\operation:
  377. .endm
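/*
 * Worked example of the num_initial_blocks arithmetic described above: for
 * a = 100 bytes of plaintext, b = floor(100/16) = 6 full blocks, so
 * num_initial_blocks = 6 mod 4 = 2 and the remaining 4 blocks go through the
 * 4-way parallel path.  The entry points below derive the same value with
 * masks; in C terms (illustrative variable names, not kernel code):
 *
 *	unsigned long full_bytes = plaintext_len & ~15UL;	// and $-16, %r13
 *	unsigned long initial = (full_bytes & (3 << 4)) >> 4;	// and $(3<<4), %r12
 */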
  378. /*
  379. * encrypt 4 blocks at a time
  380. * ghash the 4 previously encrypted ciphertext blocks
  381. * arg1, %arg2, %arg3 are used as pointers only, not modified
  382. * %r11 is the data offset value
  383. */
  384. .macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \
  385. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  386. movdqa \XMM1, \XMM5
  387. movdqa \XMM2, \XMM6
  388. movdqa \XMM3, \XMM7
  389. movdqa \XMM4, \XMM8
  390. # multiply TMP5 * HashKey using karatsuba
  391. movdqa \XMM5, \TMP4
  392. pshufd $78, \XMM5, \TMP6
  393. pxor \XMM5, \TMP6
  394. paddd ONE(%rip), \XMM0 # INCR CNT
  395. movdqa HashKey_4(%rsp), \TMP5
  396. PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  397. movdqa \XMM0, \XMM1
  398. paddd ONE(%rip), \XMM0 # INCR CNT
  399. movdqa \XMM0, \XMM2
  400. paddd ONE(%rip), \XMM0 # INCR CNT
  401. movdqa \XMM0, \XMM3
  402. paddd ONE(%rip), \XMM0 # INCR CNT
  403. movdqa \XMM0, \XMM4
  404. pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
  405. PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  406. pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
  407. pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
  408. pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
  409. pxor (%arg1), \XMM1
  410. pxor (%arg1), \XMM2
  411. pxor (%arg1), \XMM3
  412. pxor (%arg1), \XMM4
  413. movdqa HashKey_4_k(%rsp), \TMP5
  414. PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  415. movaps 0x10(%arg1), \TMP1
  416. AESENC \TMP1, \XMM1 # Round 1
  417. AESENC \TMP1, \XMM2
  418. AESENC \TMP1, \XMM3
  419. AESENC \TMP1, \XMM4
  420. movaps 0x20(%arg1), \TMP1
  421. AESENC \TMP1, \XMM1 # Round 2
  422. AESENC \TMP1, \XMM2
  423. AESENC \TMP1, \XMM3
  424. AESENC \TMP1, \XMM4
  425. movdqa \XMM6, \TMP1
  426. pshufd $78, \XMM6, \TMP2
  427. pxor \XMM6, \TMP2
  428. movdqa HashKey_3(%rsp), \TMP5
  429. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  430. movaps 0x30(%arg1), \TMP3
  431. AESENC \TMP3, \XMM1 # Round 3
  432. AESENC \TMP3, \XMM2
  433. AESENC \TMP3, \XMM3
  434. AESENC \TMP3, \XMM4
  435. PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  436. movaps 0x40(%arg1), \TMP3
  437. AESENC \TMP3, \XMM1 # Round 4
  438. AESENC \TMP3, \XMM2
  439. AESENC \TMP3, \XMM3
  440. AESENC \TMP3, \XMM4
  441. movdqa HashKey_3_k(%rsp), \TMP5
  442. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  443. movaps 0x50(%arg1), \TMP3
  444. AESENC \TMP3, \XMM1 # Round 5
  445. AESENC \TMP3, \XMM2
  446. AESENC \TMP3, \XMM3
  447. AESENC \TMP3, \XMM4
  448. pxor \TMP1, \TMP4
  449. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  450. pxor \XMM6, \XMM5
  451. pxor \TMP2, \TMP6
  452. movdqa \XMM7, \TMP1
  453. pshufd $78, \XMM7, \TMP2
  454. pxor \XMM7, \TMP2
  455. movdqa HashKey_2(%rsp ), \TMP5
  456. # Multiply TMP5 * HashKey using karatsuba
  457. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  458. movaps 0x60(%arg1), \TMP3
  459. AESENC \TMP3, \XMM1 # Round 6
  460. AESENC \TMP3, \XMM2
  461. AESENC \TMP3, \XMM3
  462. AESENC \TMP3, \XMM4
  463. PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  464. movaps 0x70(%arg1), \TMP3
  465. AESENC \TMP3, \XMM1 # Round 7
  466. AESENC \TMP3, \XMM2
  467. AESENC \TMP3, \XMM3
  468. AESENC \TMP3, \XMM4
  469. movdqa HashKey_2_k(%rsp), \TMP5
  470. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  471. movaps 0x80(%arg1), \TMP3
  472. AESENC \TMP3, \XMM1 # Round 8
  473. AESENC \TMP3, \XMM2
  474. AESENC \TMP3, \XMM3
  475. AESENC \TMP3, \XMM4
  476. pxor \TMP1, \TMP4
  477. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  478. pxor \XMM7, \XMM5
  479. pxor \TMP2, \TMP6
  480. # Multiply XMM8 * HashKey
  481. # XMM8 and TMP5 hold the values for the two operands
  482. movdqa \XMM8, \TMP1
  483. pshufd $78, \XMM8, \TMP2
  484. pxor \XMM8, \TMP2
  485. movdqa HashKey(%rsp), \TMP5
  486. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  487. movaps 0x90(%arg1), \TMP3
  488. AESENC \TMP3, \XMM1 # Round 9
  489. AESENC \TMP3, \XMM2
  490. AESENC \TMP3, \XMM3
  491. AESENC \TMP3, \XMM4
  492. PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  493. movaps 0xa0(%arg1), \TMP3
  494. AESENCLAST \TMP3, \XMM1 # Round 10
  495. AESENCLAST \TMP3, \XMM2
  496. AESENCLAST \TMP3, \XMM3
  497. AESENCLAST \TMP3, \XMM4
  498. movdqa HashKey_k(%rsp), \TMP5
  499. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  500. movdqu (%arg3,%r11,1), \TMP3
  501. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  502. .if \operation == dec
  503. movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
  504. movdqa \TMP3, \XMM1
  505. .endif
  506. movdqu 16(%arg3,%r11,1), \TMP3
  507. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  508. .if \operation == dec
  509. movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
  510. movdqa \TMP3, \XMM2
  511. .endif
  512. movdqu 32(%arg3,%r11,1), \TMP3
  513. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  514. .if \operation == dec
  515. movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
  516. movdqa \TMP3, \XMM3
  517. .endif
  518. movdqu 48(%arg3,%r11,1), \TMP3
  519. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  520. .if \operation == dec
  521. movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
  522. movdqa \TMP3, \XMM4
  523. .else
  524. movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
  525. movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
  526. movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
  527. movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
  528. .endif
  529. pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
  530. pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
  531. pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
  532. pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
  533. pxor \TMP4, \TMP1
  534. pxor \XMM8, \XMM5
  535. pxor \TMP6, \TMP2
  536. pxor \TMP1, \TMP2
  537. pxor \XMM5, \TMP2
  538. movdqa \TMP2, \TMP3
  539. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  540. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  541. pxor \TMP3, \XMM5
  542. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  543. # first phase of reduction
  544. movdqa \XMM5, \TMP2
  545. movdqa \XMM5, \TMP3
  546. movdqa \XMM5, \TMP4
  547. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
  548. pslld $31, \TMP2 # packed left shift << 31
  549. pslld $30, \TMP3 # packed left shift << 30
  550. pslld $25, \TMP4 # packed left shift << 25
  551. pxor \TMP3, \TMP2 # xor the shifted versions
  552. pxor \TMP4, \TMP2
  553. movdqa \TMP2, \TMP5
  554. psrldq $4, \TMP5 # right shift T5 1 DW
  555. pslldq $12, \TMP2 # left shift T2 3 DWs
  556. pxor \TMP2, \XMM5
  557. # second phase of reduction
  558. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  559. movdqa \XMM5,\TMP3
  560. movdqa \XMM5,\TMP4
  561. psrld $1, \TMP2 # packed right shift >>1
  562. psrld $2, \TMP3 # packed right shift >>2
  563. psrld $7, \TMP4 # packed right shift >>7
  564. pxor \TMP3,\TMP2 # xor the shifted versions
  565. pxor \TMP4,\TMP2
  566. pxor \TMP5, \TMP2
  567. pxor \TMP2, \XMM5
  568. pxor \TMP1, \XMM5 # result is in XMM5
  569. pxor \XMM5, \XMM1
  570. .endm
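/*
 * All HashKey multiplications in this file use one level of Karatsuba on the
 * 64-bit halves.  With addition being XOR in GF(2), the identity exploited is
 *
 *	(a1*x^64 + a0) * (b1*x^64 + b0)
 *	    = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0
 *
 * so three PCLMULQDQs per block (a1*b1, a0*b0 and (a1+a0)*(b1+b0), the last
 * one against the precomputed HashKey_i_k value) replace four.
 */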
  571. /* GHASH the last 4 ciphertext blocks. */
  572. .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
  573. TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
  574. # Multiply TMP6 * HashKey (using Karatsuba)
  575. movdqa \XMM1, \TMP6
  576. pshufd $78, \XMM1, \TMP2
  577. pxor \XMM1, \TMP2
  578. movdqa HashKey_4(%rsp), \TMP5
  579. PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
  580. PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
  581. movdqa HashKey_4_k(%rsp), \TMP4
  582. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  583. movdqa \XMM1, \XMMDst
  584. movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
  585. # Multiply TMP1 * HashKey (using Karatsuba)
  586. movdqa \XMM2, \TMP1
  587. pshufd $78, \XMM2, \TMP2
  588. pxor \XMM2, \TMP2
  589. movdqa HashKey_3(%rsp), \TMP5
  590. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  591. PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
  592. movdqa HashKey_3_k(%rsp), \TMP4
  593. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  594. pxor \TMP1, \TMP6
  595. pxor \XMM2, \XMMDst
  596. pxor \TMP2, \XMM1
  597. # results accumulated in TMP6, XMMDst, XMM1
  598. # Multiply TMP1 * HashKey (using Karatsuba)
  599. movdqa \XMM3, \TMP1
  600. pshufd $78, \XMM3, \TMP2
  601. pxor \XMM3, \TMP2
  602. movdqa HashKey_2(%rsp), \TMP5
  603. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  604. PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
  605. movdqa HashKey_2_k(%rsp), \TMP4
  606. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  607. pxor \TMP1, \TMP6
  608. pxor \XMM3, \XMMDst
  609. pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
  610. # Multiply TMP1 * HashKey (using Karatsuba)
  611. movdqa \XMM4, \TMP1
  612. pshufd $78, \XMM4, \TMP2
  613. pxor \XMM4, \TMP2
  614. movdqa HashKey(%rsp), \TMP5
  615. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  616. PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
  617. movdqa HashKey_k(%rsp), \TMP4
  618. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  619. pxor \TMP1, \TMP6
  620. pxor \XMM4, \XMMDst
  621. pxor \XMM1, \TMP2
  622. pxor \TMP6, \TMP2
  623. pxor \XMMDst, \TMP2
  624. # middle section of the temp results combined as in karatsuba algorithm
  625. movdqa \TMP2, \TMP4
  626. pslldq $8, \TMP4 # left shift TMP4 2 DWs
  627. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  628. pxor \TMP4, \XMMDst
  629. pxor \TMP2, \TMP6
  630. # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
  631. # first phase of the reduction
  632. movdqa \XMMDst, \TMP2
  633. movdqa \XMMDst, \TMP3
  634. movdqa \XMMDst, \TMP4
  635. # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
  636. pslld $31, \TMP2 # packed left shifting << 31
  637. pslld $30, \TMP3 # packed left shifting << 30
  638. pslld $25, \TMP4 # packed left shifting << 25
  639. pxor \TMP3, \TMP2 # xor the shifted versions
  640. pxor \TMP4, \TMP2
  641. movdqa \TMP2, \TMP7
  642. psrldq $4, \TMP7 # right shift TMP7 1 DW
  643. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  644. pxor \TMP2, \XMMDst
  645. # second phase of the reduction
  646. movdqa \XMMDst, \TMP2
  647. # make 3 copies of XMMDst for doing 3 shift operations
  648. movdqa \XMMDst, \TMP3
  649. movdqa \XMMDst, \TMP4
  650. psrld $1, \TMP2 # packed right shift >> 1
  651. psrld $2, \TMP3 # packed right shift >> 2
  652. psrld $7, \TMP4 # packed right shift >> 7
  653. pxor \TMP3, \TMP2 # xor the shifted versions
  654. pxor \TMP4, \TMP2
  655. pxor \TMP7, \TMP2
  656. pxor \TMP2, \XMMDst
  657. pxor \TMP6, \XMMDst # reduced result is in XMMDst
  658. .endm
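/*
 * Together with GHASH_4_ENCRYPT_4_PARALLEL, the macro above evaluates the
 * serial GHASH recurrence by distributing powers of the hash key.  With
 * XMM1..XMM4 holding the four byte-reflected ciphertext blocks X1..X4 (the
 * running hash already folded into X1), the result is
 *
 *	XMMDst = X1*H^4 + X2*H^3 + X3*H^2 + X4*H	(+ meaning XOR)
 *
 * which equals the Horner form (((X1*H + X2)*H + X3)*H + X4)*H in which
 * GHASH is normally specified.
 */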
  659. /* Encrypt a single block */
  660. .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
  661. pxor (%arg1), \XMM0
  662. movaps 16(%arg1), \TMP1
  663. AESENC \TMP1, \XMM0
  664. movaps 32(%arg1), \TMP1
  665. AESENC \TMP1, \XMM0
  666. movaps 48(%arg1), \TMP1
  667. AESENC \TMP1, \XMM0
  668. movaps 64(%arg1), \TMP1
  669. AESENC \TMP1, \XMM0
  670. movaps 80(%arg1), \TMP1
  671. AESENC \TMP1, \XMM0
  672. movaps 96(%arg1), \TMP1
  673. AESENC \TMP1, \XMM0
  674. movaps 112(%arg1), \TMP1
  675. AESENC \TMP1, \XMM0
  676. movaps 128(%arg1), \TMP1
  677. AESENC \TMP1, \XMM0
  678. movaps 144(%arg1), \TMP1
  679. AESENC \TMP1, \XMM0
  680. movaps 160(%arg1), \TMP1
  681. AESENCLAST \TMP1, \XMM0
  682. .endm
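/*
 * ENCRYPT_SINGLE_BLOCK is a fixed 10-round (AES-128) encryption of one
 * counter block.  An equivalent intrinsics sketch, assuming an 11-entry
 * round-key array laid out like the expanded key at %arg1 (the function and
 * parameter names are illustrative):
 *
 *	#include <wmmintrin.h>		// AES-NI intrinsics
 *
 *	static __m128i aes128_encrypt_block(const __m128i rk[11], __m128i b)
 *	{
 *		b = _mm_xor_si128(b, rk[0]);		// round 0: AddRoundKey
 *		for (int r = 1; r < 10; r++)
 *			b = _mm_aesenc_si128(b, rk[r]);	// rounds 1..9
 *		return _mm_aesenclast_si128(b, rk[10]);	// round 10
 *	}
 */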
  683. /*****************************************************************************
  684. * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  685. * u8 *out, // Plaintext output. Encrypt in-place is allowed.
  686. * const u8 *in, // Ciphertext input
  687. * u64 plaintext_len, // Length of data in bytes for decryption.
  688. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  689. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  690. * // concatenated with 0x00000001. 16-byte aligned pointer.
  691. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  692. * const u8 *aad, // Additional Authentication Data (AAD)
  693. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  694. * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
  695. * // given authentication tag and only return the plaintext if they match.
  696. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
  697. * // (most likely), 12 or 8.
  698. *
  699. * Assumptions:
  700. *
  701. * keys:
  702. * keys are pre-expanded and aligned to 16 bytes. we are using the first
  703. * set of 11 keys in the data structure void *aes_ctx
  704. *
  705. * iv:
  706. * 0 1 2 3
  707. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  708. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  709. * | Salt (From the SA) |
  710. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  711. * | Initialization Vector |
  712. * | (This is the sequence number from IPSec header) |
  713. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  714. * | 0x1 |
  715. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  716. *
  717. *
  718. *
  719. * AAD:
  720. * AAD padded to 128 bits with 0
  721. * for example, assume AAD is a u32 vector
  722. *
  723. * if AAD is 8 bytes:
  724. * AAD[3] = {A0, A1};
  725. * padded AAD in xmm register = {A1 A0 0 0}
  726. *
  727. * 0 1 2 3
  728. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  729. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  730. * | SPI (A1) |
  731. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  732. * | 32-bit Sequence Number (A0) |
  733. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  734. * | 0x0 |
  735. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  736. *
  737. * AAD Format with 32-bit Sequence Number
  738. *
  739. * if AAD is 12 bytes:
  740. * AAD[3] = {A0, A1, A2};
  741. * padded AAD in xmm register = {A2 A1 A0 0}
  742. *
  743. * 0 1 2 3
  744. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  745. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  746. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  747. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  748. * | SPI (A2) |
  749. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  750. * | 64-bit Extended Sequence Number {A1,A0} |
  751. * | |
  752. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  753. * | 0x0 |
  754. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  755. *
  756. * AAD Format with 64-bit Extended Sequence Number
  757. *
  758. * aadLen:
  759. * from the definition of the spec, aadLen can only be 8 or 12 bytes.
  760. * The code supports 16 too but for other sizes, the code will fail.
  761. *
  762. * TLen:
  763. * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
  764. * For other sizes, the code will fail.
  765. *
  766. * poly = x^128 + x^127 + x^126 + x^121 + 1
  767. *
  768. *****************************************************************************/
  769. ENTRY(aesni_gcm_dec)
  770. push %r12
  771. push %r13
  772. push %r14
  773. mov %rsp, %r14
  774. /*
  775. * states of %xmm registers %xmm6:%xmm15 not saved
  776. * all %xmm registers are clobbered
  777. */
  778. sub $VARIABLE_OFFSET, %rsp
  779. and $~63, %rsp # align rsp to 64 bytes
  780. mov %arg6, %r12
  781. movdqu (%r12), %xmm13 # %xmm13 = HashKey
  782. pshufb SHUF_MASK(%rip), %xmm13
  783. # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
  784. movdqa %xmm13, %xmm2
  785. psllq $1, %xmm13
  786. psrlq $63, %xmm2
  787. movdqa %xmm2, %xmm1
  788. pslldq $8, %xmm2
  789. psrldq $8, %xmm1
  790. por %xmm2, %xmm13
  791. # Reduction
  792. pshufd $0x24, %xmm1, %xmm2
  793. pcmpeqd TWOONE(%rip), %xmm2
  794. pand POLY(%rip), %xmm2
  795. pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
  796. # Decrypt first few blocks
  797. movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
  798. mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
  799. and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
  800. mov %r13, %r12
  801. and $(3<<4), %r12
  802. jz _initial_num_blocks_is_0_decrypt
  803. cmp $(2<<4), %r12
  804. jb _initial_num_blocks_is_1_decrypt
  805. je _initial_num_blocks_is_2_decrypt
  806. _initial_num_blocks_is_3_decrypt:
  807. INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  808. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
  809. sub $48, %r13
  810. jmp _initial_blocks_decrypted
  811. _initial_num_blocks_is_2_decrypt:
  812. INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  813. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
  814. sub $32, %r13
  815. jmp _initial_blocks_decrypted
  816. _initial_num_blocks_is_1_decrypt:
  817. INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  818. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
  819. sub $16, %r13
  820. jmp _initial_blocks_decrypted
  821. _initial_num_blocks_is_0_decrypt:
  822. INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  823. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
  824. _initial_blocks_decrypted:
  825. cmp $0, %r13
  826. je _zero_cipher_left_decrypt
  827. sub $64, %r13
  828. je _four_cipher_left_decrypt
  829. _decrypt_by_4:
  830. GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  831. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
  832. add $64, %r11
  833. sub $64, %r13
  834. jne _decrypt_by_4
  835. _four_cipher_left_decrypt:
  836. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  837. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  838. _zero_cipher_left_decrypt:
  839. mov %arg4, %r13
  840. and $15, %r13 # %r13 = arg4 (mod 16)
  841. je _multiple_of_16_bytes_decrypt
  842. # Handle the last <16 byte block separately
  843. paddd ONE(%rip), %xmm0 # increment CNT to get Yn
  844. pshufb SHUF_MASK(%rip), %xmm0
  845. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
  846. sub $16, %r11
  847. add %r13, %r11
  848. movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
  849. lea SHIFT_MASK+16(%rip), %r12
  850. sub %r13, %r12
  851. # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
  852. # (%r13 is the number of bytes in plaintext mod 16)
  853. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
  854. pshufb %xmm2, %xmm1 # right shift 16-%r13 bytes
  855. movdqa %xmm1, %xmm2
  856. pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
  857. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  858. # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
  859. pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
  860. pand %xmm1, %xmm2
  861. pshufb SHUF_MASK(%rip),%xmm2
  862. pxor %xmm2, %xmm8
  863. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  864. # GHASH computation for the last <16 byte block
  865. sub %r13, %r11
  866. add $16, %r11
  867. # output %r13 bytes
  868. movq %xmm0, %rax
  869. cmp $8, %r13
  870. jle _less_than_8_bytes_left_decrypt
  871. mov %rax, (%arg2 , %r11, 1)
  872. add $8, %r11
  873. psrldq $8, %xmm0
  874. movq %xmm0, %rax
  875. sub $8, %r13
  876. _less_than_8_bytes_left_decrypt:
  877. mov %al, (%arg2, %r11, 1)
  878. add $1, %r11
  879. shr $8, %rax
  880. sub $1, %r13
  881. jne _less_than_8_bytes_left_decrypt
  882. _multiple_of_16_bytes_decrypt:
  883. mov arg8, %r12 # %r12 = aadLen (number of bytes)
  884. shl $3, %r12 # convert into number of bits
  885. movd %r12d, %xmm15 # len(A) in %xmm15
  886. shl $3, %arg4 # len(C) in bits (*8)
  887. movq %arg4, %xmm1
  888. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  889. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  890. pxor %xmm15, %xmm8
  891. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  892. # final GHASH computation
  893. pshufb SHUF_MASK(%rip), %xmm8
  894. mov %arg5, %rax # %rax = *Y0
  895. movdqu (%rax), %xmm0 # %xmm0 = Y0
  896. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
  897. pxor %xmm8, %xmm0
  898. _return_T_decrypt:
  899. mov arg9, %r10 # %r10 = authTag
  900. mov arg10, %r11 # %r11 = auth_tag_len
  901. cmp $16, %r11
  902. je _T_16_decrypt
  903. cmp $12, %r11
  904. je _T_12_decrypt
  905. _T_8_decrypt:
  906. movq %xmm0, %rax
  907. mov %rax, (%r10)
  908. jmp _return_T_done_decrypt
  909. _T_12_decrypt:
  910. movq %xmm0, %rax
  911. mov %rax, (%r10)
  912. psrldq $8, %xmm0
  913. movd %xmm0, %eax
  914. mov %eax, 8(%r10)
  915. jmp _return_T_done_decrypt
  916. _T_16_decrypt:
  917. movdqu %xmm0, (%r10)
  918. _return_T_done_decrypt:
  919. mov %r14, %rsp
  920. pop %r14
  921. pop %r13
  922. pop %r12
  923. ret
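/*
 * For reference, the C-side declaration implied by the comment block above
 * aesni_gcm_dec reads roughly as follows (a sketch; the real declaration
 * lives in the C glue code, not in this file):
 *
 *	void aesni_gcm_dec(void *aes_ctx, u8 *out, const u8 *in,
 *			   u64 plaintext_len, u8 *iv, u8 *hash_subkey,
 *			   const u8 *aad, u64 aad_len,
 *			   u8 *auth_tag, u64 auth_tag_len);
 */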
  924. /*****************************************************************************
  925. * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  926. * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
  927. * const u8 *in, // Plaintext input
  928. * u64 plaintext_len, // Length of data in bytes for encryption.
  929. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  930. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  931. * // concatenated with 0x00000001. 16-byte aligned pointer.
  932. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  933. * const u8 *aad, // Additional Authentication Data (AAD)
  934. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  935. * u8 *auth_tag, // Authenticated Tag output.
  936. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
  937. * // 12 or 8.
  938. *
  939. * Assumptions:
  940. *
  941. * keys:
  942. * keys are pre-expanded and aligned to 16 bytes. we are using the
  943. * first set of 11 keys in the data structure void *aes_ctx
  944. *
  945. *
  946. * iv:
  947. * 0 1 2 3
  948. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  949. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  950. * | Salt (From the SA) |
  951. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  952. * | Initialization Vector |
  953. * | (This is the sequence number from IPSec header) |
  954. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  955. * | 0x1 |
  956. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  957. *
  958. *
  959. *
  960. * AAD:
  961. * AAD padded to 128 bits with 0
  962. * for example, assume AAD is a u32 vector
  963. *
  964. * if AAD is 8 bytes:
  965. * AAD[3] = {A0, A1};
  966. * padded AAD in xmm register = {A1 A0 0 0}
  967. *
  968. * 0 1 2 3
  969. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  970. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  971. * | SPI (A1) |
  972. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  973. * | 32-bit Sequence Number (A0) |
  974. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  975. * | 0x0 |
  976. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  977. *
  978. * AAD Format with 32-bit Sequence Number
  979. *
  980. * if AAD is 12 bytes:
  981. * AAD[3] = {A0, A1, A2};
  982. * padded AAD in xmm register = {A2 A1 A0 0}
  983. *
  984. * 0 1 2 3
  985. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  986. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  987. * | SPI (A2) |
  988. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  989. * | 64-bit Extended Sequence Number {A1,A0} |
  990. * | |
  991. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  992. * | 0x0 |
  993. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  994. *
  995. * AAD Format with 64-bit Extended Sequence Number
  996. *
  997. * aadLen:
  998. * from the definition of the spec, aadLen can only be 8 or 12 bytes.
  999. * The code supports 16 too but for other sizes, the code will fail.
  1000. *
  1001. * TLen:
  1002. * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
  1003. * For other sizes, the code will fail.
  1004. *
  1005. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1006. ***************************************************************************/
  1007. ENTRY(aesni_gcm_enc)
  1008. push %r12
  1009. push %r13
  1010. push %r14
  1011. mov %rsp, %r14
  1012. #
  1013. # states of %xmm registers %xmm6:%xmm15 not saved
  1014. # all %xmm registers are clobbered
  1015. #
  1016. sub $VARIABLE_OFFSET, %rsp
  1017. and $~63, %rsp
  1018. mov %arg6, %r12
  1019. movdqu (%r12), %xmm13
  1020. pshufb SHUF_MASK(%rip), %xmm13
  1021. # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
  1022. movdqa %xmm13, %xmm2
  1023. psllq $1, %xmm13
  1024. psrlq $63, %xmm2
  1025. movdqa %xmm2, %xmm1
  1026. pslldq $8, %xmm2
  1027. psrldq $8, %xmm1
  1028. por %xmm2, %xmm13
  1029. # reduce HashKey<<1
  1030. pshufd $0x24, %xmm1, %xmm2
  1031. pcmpeqd TWOONE(%rip), %xmm2
  1032. pand POLY(%rip), %xmm2
  1033. pxor %xmm2, %xmm13
  1034. movdqa %xmm13, HashKey(%rsp) # %xmm13 holds HashKey<<1 (mod poly)
  1035. mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
  1036. and $-16, %r13
  1037. mov %r13, %r12
  1038. # Encrypt first few blocks
  1039. and $(3<<4), %r12
  1040. jz _initial_num_blocks_is_0_encrypt
  1041. cmp $(2<<4), %r12
  1042. jb _initial_num_blocks_is_1_encrypt
  1043. je _initial_num_blocks_is_2_encrypt
  1044. _initial_num_blocks_is_3_encrypt:
  1045. INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1046. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
  1047. sub $48, %r13
  1048. jmp _initial_blocks_encrypted
  1049. _initial_num_blocks_is_2_encrypt:
  1050. INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1051. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
  1052. sub $32, %r13
  1053. jmp _initial_blocks_encrypted
  1054. _initial_num_blocks_is_1_encrypt:
  1055. INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1056. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
  1057. sub $16, %r13
  1058. jmp _initial_blocks_encrypted
  1059. _initial_num_blocks_is_0_encrypt:
  1060. INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1061. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
  1062. _initial_blocks_encrypted:
  1063. # Main loop - Encrypt remaining blocks
  1064. cmp $0, %r13
  1065. je _zero_cipher_left_encrypt
  1066. sub $64, %r13
  1067. je _four_cipher_left_encrypt
  1068. _encrypt_by_4_encrypt:
  1069. GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  1070. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
  1071. add $64, %r11
  1072. sub $64, %r13
  1073. jne _encrypt_by_4_encrypt
  1074. _four_cipher_left_encrypt:
  1075. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  1076. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  1077. _zero_cipher_left_encrypt:
  1078. mov %arg4, %r13
  1079. and $15, %r13 # %r13 = arg4 (mod 16)
  1080. je _multiple_of_16_bytes_encrypt
  1081. # Handle the last <16 byte block separately
  1082. paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
  1083. pshufb SHUF_MASK(%rip), %xmm0
  1084. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
  1085. sub $16, %r11
  1086. add %r13, %r11
  1087. movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
  1088. lea SHIFT_MASK+16(%rip), %r12
  1089. sub %r13, %r12
  1090. # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
  1091. # (%r13 is the number of bytes in plaintext mod 16)
  1092. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
  1093. pshufb %xmm2, %xmm1 # shift right 16-%r13 bytes
  1094. pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
  1095. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  1096. # get the appropriate mask to mask out top 16-r13 bytes of xmm0
  1097. pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
  1098. pshufb SHUF_MASK(%rip),%xmm0
  1099. pxor %xmm0, %xmm8
  1100. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1101. # GHASH computation for the last <16 byte block
  1102. sub %r13, %r11
  1103. add $16, %r11
  1104. pshufb SHUF_MASK(%rip), %xmm0
  1105. # shuffle xmm0 back to output as ciphertext
  1106. # Output %r13 bytes
  1107. movq %xmm0, %rax
  1108. cmp $8, %r13
  1109. jle _less_than_8_bytes_left_encrypt
  1110. mov %rax, (%arg2 , %r11, 1)
  1111. add $8, %r11
  1112. psrldq $8, %xmm0
  1113. movq %xmm0, %rax
  1114. sub $8, %r13
  1115. _less_than_8_bytes_left_encrypt:
  1116. mov %al, (%arg2, %r11, 1)
  1117. add $1, %r11
  1118. shr $8, %rax
  1119. sub $1, %r13
  1120. jne _less_than_8_bytes_left_encrypt
  1121. _multiple_of_16_bytes_encrypt:
  1122. mov arg8, %r12 # %r12 = aadLen (number of bytes)
  1123. shl $3, %r12
  1124. movd %r12d, %xmm15 # len(A) in %xmm15
  1125. shl $3, %arg4 # len(C) in bits (*8)
  1126. movq %arg4, %xmm1
  1127. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  1128. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  1129. pxor %xmm15, %xmm8
  1130. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1131. # final GHASH computation
  1132. pshufb SHUF_MASK(%rip), %xmm8 # perform a 16 byte swap
  1133. mov %arg5, %rax # %rax = *Y0
  1134. movdqu (%rax), %xmm0 # %xmm0 = Y0
  1135. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
  1136. pxor %xmm8, %xmm0
  1137. _return_T_encrypt:
  1138. mov arg9, %r10 # %r10 = authTag
  1139. mov arg10, %r11 # %r11 = auth_tag_len
  1140. cmp $16, %r11
  1141. je _T_16_encrypt
  1142. cmp $12, %r11
  1143. je _T_12_encrypt
  1144. _T_8_encrypt:
  1145. movq %xmm0, %rax
  1146. mov %rax, (%r10)
  1147. jmp _return_T_done_encrypt
  1148. _T_12_encrypt:
  1149. movq %xmm0, %rax
  1150. mov %rax, (%r10)
  1151. psrldq $8, %xmm0
  1152. movd %xmm0, %eax
  1153. mov %eax, 8(%r10)
  1154. jmp _return_T_done_encrypt
  1155. _T_16_encrypt:
  1156. movdqu %xmm0, (%r10)
  1157. _return_T_done_encrypt:
  1158. mov %r14, %rsp
  1159. pop %r14
  1160. pop %r13
  1161. pop %r12
  1162. ret
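/*
 * The matching C-side declaration for the encrypt path (again a sketch based
 * on the comment block above aesni_gcm_enc):
 *
 *	void aesni_gcm_enc(void *aes_ctx, u8 *out, const u8 *in,
 *			   u64 plaintext_len, u8 *iv, u8 *hash_subkey,
 *			   const u8 *aad, u64 aad_len,
 *			   u8 *auth_tag, u64 auth_tag_len);
 *
 * The computed tag is written to auth_tag; on the decrypt side it is the
 * caller that compares it against the received tag.
 */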
  1163. _key_expansion_128:
  1164. _key_expansion_256a:
  1165. pshufd $0b11111111, %xmm1, %xmm1
  1166. shufps $0b00010000, %xmm0, %xmm4
  1167. pxor %xmm4, %xmm0
  1168. shufps $0b10001100, %xmm0, %xmm4
  1169. pxor %xmm4, %xmm0
  1170. pxor %xmm1, %xmm0
  1171. movaps %xmm0, (%rcx)
  1172. add $0x10, %rcx
  1173. ret
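/*
 * The shufps/pxor sequence above (with %xmm4 pre-zeroed by aesni_set_key)
 * is a prefix-XOR of the four key words, combined with the broadcast
 * AESKEYGENASSIST result; the same body is shared with _key_expansion_256a.
 * The equivalent AES-128 key-expansion step written with intrinsics, as a
 * sketch only (not kernel code):
 *
 *	#include <wmmintrin.h>
 *
 *	static __m128i key_expand_128(__m128i prev, __m128i assist)
 *	{
 *		assist = _mm_shuffle_epi32(assist, 0xff);	// pshufd $0b11111111
 *		prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *		prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *		prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *		return _mm_xor_si128(prev, assist);		// next round key
 *	}
 */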
  1174. _key_expansion_192a:
  1175. pshufd $0b01010101, %xmm1, %xmm1
  1176. shufps $0b00010000, %xmm0, %xmm4
  1177. pxor %xmm4, %xmm0
  1178. shufps $0b10001100, %xmm0, %xmm4
  1179. pxor %xmm4, %xmm0
  1180. pxor %xmm1, %xmm0
  1181. movaps %xmm2, %xmm5
  1182. movaps %xmm2, %xmm6
  1183. pslldq $4, %xmm5
  1184. pshufd $0b11111111, %xmm0, %xmm3
  1185. pxor %xmm3, %xmm2
  1186. pxor %xmm5, %xmm2
  1187. movaps %xmm0, %xmm1
  1188. shufps $0b01000100, %xmm0, %xmm6
  1189. movaps %xmm6, (%rcx)
  1190. shufps $0b01001110, %xmm2, %xmm1
  1191. movaps %xmm1, 16(%rcx)
  1192. add $0x20, %rcx
  1193. ret
  1194. _key_expansion_192b:
  1195. pshufd $0b01010101, %xmm1, %xmm1
  1196. shufps $0b00010000, %xmm0, %xmm4
  1197. pxor %xmm4, %xmm0
  1198. shufps $0b10001100, %xmm0, %xmm4
  1199. pxor %xmm4, %xmm0
  1200. pxor %xmm1, %xmm0
  1201. movaps %xmm2, %xmm5
  1202. pslldq $4, %xmm5
  1203. pshufd $0b11111111, %xmm0, %xmm3
  1204. pxor %xmm3, %xmm2
  1205. pxor %xmm5, %xmm2
  1206. movaps %xmm0, (%rcx)
  1207. add $0x10, %rcx
  1208. ret
  1209. _key_expansion_256b:
  1210. pshufd $0b10101010, %xmm1, %xmm1
  1211. shufps $0b00010000, %xmm2, %xmm4
  1212. pxor %xmm4, %xmm2
  1213. shufps $0b10001100, %xmm2, %xmm4
  1214. pxor %xmm4, %xmm2
  1215. pxor %xmm1, %xmm2
  1216. movaps %xmm2, (%rcx)
  1217. add $0x10, %rcx
  1218. ret
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *		     unsigned int key_len)
 */
ENTRY(aesni_set_key)
	movups (%rsi), %xmm0	# user key (first 16 bytes)
	movaps %xmm0, (%rdi)
	lea 0x10(%rdi), %rcx	# key addr
	movl %edx, 480(%rdi)
	pxor %xmm4, %xmm4	# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(%rsi), %xmm2	# other user key
	movaps %xmm2, (%rcx)
	add $0x10, %rcx
	AESKEYGENASSIST 0x1 %xmm2 %xmm1	# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1	# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1	# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1	# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(%rsi), %xmm2	# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1	# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1	# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1	# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1	# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1	# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1	# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1	# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1	# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	sub $0x10, %rcx
	movaps (%rdi), %xmm0
	movaps (%rcx), %xmm1
	movaps %xmm0, 240(%rcx)
	movaps %xmm1, 240(%rdi)
	add $0x10, %rdi
	lea 240-16(%rcx), %rsi
	.align 4
.Ldec_key_loop:
	movaps (%rdi), %xmm0
	AESIMC %xmm0 %xmm1
	movaps %xmm1, (%rsi)
	add $0x10, %rdi
	sub $0x10, %rsi
	cmp %rcx, %rdi
	jb .Ldec_key_loop
	xor %rax, %rax
	ret
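/*
 * The .Ldec_key tail derives the decryption schedule required by AESDEC
 * (the Equivalent Inverse Cipher): the first and last encryption round
 * keys swap places, and every key in between is run through AESIMC and
 * stored in mirrored order.  A minimal sketch of the same transform in C
 * intrinsics (illustrative only; in the driver the decryption keys live
 * in the same context at offset 240 rather than in a separate array):
 *
 *	#include <immintrin.h>
 *
 *	static void aes_dec_schedule(__m128i *dec, const __m128i *enc, int nr)
 *	{
 *		int i;
 *
 *		dec[0] = enc[nr];		// last enc key opens decryption
 *		for (i = 1; i < nr; i++)
 *			dec[i] = _mm_aesimc_si128(enc[nr - i]);
 *		dec[nr] = enc[0];		// first enc key closes it
 *	}
 */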
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
	movl 480(KEYP), KLEN	# key length
	movups (INP), STATE	# input
	call _aesni_enc1
	movups STATE, (OUTP)	# output
	ret
/*
 * _aesni_enc1:	internal ABI
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	key length
 *	STATE:	initial state (input)
 * output:
 *	STATE:	final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_enc1:
	movaps (KEYP), KEY	# key
	mov KEYP, TKEYP
	pxor KEY, STATE	# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
	.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
	.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE
	ret
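/*
 * _aesni_enc1 is the straight-line form of the generic AES round loop:
 * AddRoundKey for round 0, AESENC for the middle rounds and AESENCLAST
 * for the final round, with the KLEN compares skipping the extra
 * round-key pairs that only 192- and 256-bit keys need (12 and 14 rounds
 * versus 10).  A minimal C-intrinsics sketch of the same computation,
 * with a loop instead of unrolled rounds (names are mine, not the
 * driver's):
 *
 *	#include <immintrin.h>
 *
 *	static __m128i aes_encrypt_block(__m128i state, const __m128i *rk, int nr)
 *	{
 *		int i;
 *
 *		state = _mm_xor_si128(state, rk[0]);		// round 0
 *		for (i = 1; i < nr; i++)
 *			state = _mm_aesenc_si128(state, rk[i]);	// rounds 1..nr-1
 *		return _mm_aesenclast_si128(state, rk[nr]);	// last round
 *	}
 */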
/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	key length
 *	STATE1:	initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:	final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_enc4:
	movaps (KEYP), KEY	# key
	mov KEYP, TKEYP
	pxor KEY, STATE1	# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1	# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
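/*
 * _aesni_enc4 is the same round structure applied to four independent
 * blocks: each round key is loaded once and issued against STATE1..4,
 * so the pipelined AESENC unit can overlap the four dependency chains.
 * A sketch of the interleaving in C intrinsics (illustrative only,
 * reusing the aes_encrypt_block name from the sketch above):
 *
 *	static void aes_encrypt_4blocks(__m128i s[4], const __m128i *rk, int nr)
 *	{
 *		int i, b;
 *
 *		for (b = 0; b < 4; b++)
 *			s[b] = _mm_xor_si128(s[b], rk[0]);
 *		for (i = 1; i < nr; i++)
 *			for (b = 0; b < 4; b++)		// same key, four states
 *				s[b] = _mm_aesenc_si128(s[b], rk[i]);
 *		for (b = 0; b < 4; b++)
 *			s[b] = _mm_aesenclast_si128(s[b], rk[nr]);
 *	}
 */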
/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_dec)
	mov 480(KEYP), KLEN	# key length
	add $240, KEYP
	movups (INP), STATE	# input
	call _aesni_dec1
	movups STATE, (OUTP)	# output
	ret
/*
 * _aesni_dec1:	internal ABI
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	key length
 *	STATE:	initial state (input)
 * output:
 *	STATE:	final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_dec1:
	movaps (KEYP), KEY	# key
	mov KEYP, TKEYP
	pxor KEY, STATE	# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
	.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
	.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE
	ret
/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:	key struct pointer
 *	KLEN:	key length
 *	STATE1:	initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:	final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_dec4:
	movaps (KEYP), KEY	# key
	mov KEYP, TKEYP
	pxor KEY, STATE1	# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1	# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
ENTRY(aesni_ecb_enc)
	test LEN, LEN	# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
	.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
	.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
	ret
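/*
 * aesni_ecb_enc is just a driver loop: 64-byte chunks go through
 * _aesni_enc4, the remaining whole 16-byte blocks through _aesni_enc1,
 * and any tail shorter than a block is ignored.  The same control flow
 * in C (a sketch building on the helpers sketched above; not the kernel
 * glue code, which goes through the crypto API instead):
 *
 *	static void ecb_encrypt(const __m128i *rk, int nr, unsigned char *dst,
 *				const unsigned char *src, unsigned long len)
 *	{
 *		__m128i s[4];
 *		int b;
 *
 *		while (len >= 64) {			// four blocks at a time
 *			for (b = 0; b < 4; b++)
 *				s[b] = _mm_loadu_si128((const __m128i *)src + b);
 *			aes_encrypt_4blocks(s, rk, nr);
 *			for (b = 0; b < 4; b++)
 *				_mm_storeu_si128((__m128i *)dst + b, s[b]);
 *			src += 64; dst += 64; len -= 64;
 *		}
 *		while (len >= 16) {			// leftover single blocks
 *			__m128i t = _mm_loadu_si128((const __m128i *)src);
 *			_mm_storeu_si128((__m128i *)dst,
 *					 aes_encrypt_block(t, rk, nr));
 *			src += 16; dst += 16; len -= 16;
 *		}
 *	}
 */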
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 */
ENTRY(aesni_ecb_dec)
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
	.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
	.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
	ret
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_enc)
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
	.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)
.Lcbc_enc_ret:
	ret
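/*
 * CBC encryption cannot be parallelized: every block's input is the
 * previous block's ciphertext, so aesni_cbc_enc only has the one-block
 * loop.  A sketch of the chaining in C (illustrative; reuses
 * aes_encrypt_block from the sketch above):
 *
 *	static void cbc_encrypt(const __m128i *rk, int nr, unsigned char *dst,
 *				const unsigned char *src, unsigned long len,
 *				unsigned char *iv)
 *	{
 *		__m128i state;
 *
 *		if (len < 16)
 *			return;
 *		state = _mm_loadu_si128((const __m128i *)iv);
 *		while (len >= 16) {
 *			// C_i = E_K(P_i ^ C_{i-1}), with C_0 = IV
 *			state = _mm_xor_si128(state,
 *					_mm_loadu_si128((const __m128i *)src));
 *			state = aes_encrypt_block(state, rk, nr);
 *			_mm_storeu_si128((__m128i *)dst, state);
 *			src += 16; dst += 16; len -= 16;
 *		}
 *		_mm_storeu_si128((__m128i *)iv, state);	// chaining value out
 *	}
 */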
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_cbc_dec)
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
	.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
	call _aesni_dec4
	pxor IV, STATE1
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
	.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
	ret
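/*
 * CBC decryption, unlike encryption, does parallelize: P_i = D_K(C_i) ^
 * C_{i-1}, and all the ciphertexts are already in hand.  That is why the
 * loop above keeps the raw inputs in IN1..IN4, decrypts four blocks with
 * _aesni_dec4 and only then xors each result with the preceding
 * ciphertext (or the IV).  The per-block relation, sketched in C as a
 * single-block loop for clarity (names are mine, not the driver's):
 *
 *	static __m128i aes_decrypt_block(__m128i s, const __m128i *drk, int nr)
 *	{
 *		int i;
 *
 *		s = _mm_xor_si128(s, drk[0]);
 *		for (i = 1; i < nr; i++)
 *			s = _mm_aesdec_si128(s, drk[i]);
 *		return _mm_aesdeclast_si128(s, drk[nr]);
 *	}
 *
 *	static void cbc_decrypt(const __m128i *drk, int nr, unsigned char *dst,
 *				const unsigned char *src, unsigned long len,
 *				unsigned char *ivp)
 *	{
 *		__m128i iv = _mm_loadu_si128((const __m128i *)ivp);
 *
 *		while (len >= 16) {
 *			__m128i c = _mm_loadu_si128((const __m128i *)src);
 *			__m128i p = _mm_xor_si128(aes_decrypt_block(c, drk, nr), iv);
 *			_mm_storeu_si128((__m128i *)dst, p);
 *			iv = c;			// previous ciphertext chains on
 *			src += 16; dst += 16; len -= 16;
 *		}
 *		_mm_storeu_si128((__m128i *)ivp, iv);
 *	}
 */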
	.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/*
 * _aesni_inc_init:	internal ABI
 * setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC
	MOVQ_R64_XMM CTR TCTR_LOW
	ret
/*
 * _aesni_inc:	internal ABI
 * Increase IV by 1; IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	increased by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
_aesni_inc:
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV
	ret
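/*
 * _aesni_inc keeps the counter byte-swapped (little endian in the xmm
 * lanes) so a plain paddq can add one; the shadow copy of the low qword
 * in TCTR_LOW detects the wrap, and on carry the 1 is shifted up eight
 * bytes and added into the high qword as well.  The net effect is a
 * 128-bit big-endian increment, i.e. in plain C (illustrative only):
 *
 *	static void ctr128_inc(unsigned char ctr[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)
 *			if (++ctr[i])		// stop once a byte does not wrap
 *				break;
 *	}
 */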
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
ENTRY(aesni_ctr_enc)
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
	.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
	.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	ret
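/*
 * CTR mode turns the block cipher into a stream cipher: each counter
 * block is encrypted and the result xored into the data, so decryption
 * is the same routine.  The 4-way path above generates four consecutive
 * counter blocks with _aesni_inc and encrypts them together through
 * _aesni_enc4.  A single-block sketch in C (reusing the helpers sketched
 * earlier; not the kernel API):
 *
 *	static void ctr_crypt(const __m128i *rk, int nr, unsigned char *dst,
 *			      const unsigned char *src, unsigned long len,
 *			      unsigned char ctr[16])
 *	{
 *		while (len >= 16) {
 *			__m128i ks = aes_encrypt_block(
 *				_mm_loadu_si128((const __m128i *)ctr), rk, nr);
 *			__m128i p = _mm_loadu_si128((const __m128i *)src);
 *			_mm_storeu_si128((__m128i *)dst, _mm_xor_si128(ks, p));
 *			ctr128_inc(ctr);	// next counter block
 *			src += 16; dst += 16; len -= 16;
 *		}
 *	}
 */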