  1. /*
  2. * Implement AES algorithm in Intel AES-NI instructions.
  3. *
  4. * The white paper of AES-NI instructions can be downloaded from:
  5. * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
  6. *
  7. * Copyright (C) 2008, Intel Corp.
  8. * Author: Huang Ying <ying.huang@intel.com>
  9. * Vinodh Gopal <vinodh.gopal@intel.com>
  10. * Kahraman Akdemir
  11. *
  12. * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  13. * interface for 64-bit kernels.
  14. * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  15. * Aidan O'Mahony (aidan.o.mahony@intel.com)
  16. * Adrian Hoban <adrian.hoban@intel.com>
  17. * James Guilford (james.guilford@intel.com)
  18. * Gabriele Paoloni <gabriele.paoloni@intel.com>
  19. * Tadeusz Struk (tadeusz.struk@intel.com)
  20. * Wajdi Feghali (wajdi.k.feghali@intel.com)
  21. * Copyright (c) 2010, Intel Corporation.
  22. *
  23. * Ported x86_64 version to x86:
  24. * Author: Mathias Krause <minipli@googlemail.com>
  25. *
  26. * This program is free software; you can redistribute it and/or modify
  27. * it under the terms of the GNU General Public License as published by
  28. * the Free Software Foundation; either version 2 of the License, or
  29. * (at your option) any later version.
  30. */
  31. #include <linux/linkage.h>
  32. #include <asm/inst.h>
  33. #ifdef __x86_64__
  34. .data
  35. POLY: .octa 0xC2000000000000000000000000000001
  36. TWOONE: .octa 0x00000001000000000000000000000001
  37. # order of these constants should not change.
  38. # more specifically, ALL_F should follow SHIFT_MASK,
  39. # and ZERO should follow ALL_F
  40. SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
  41. MASK1: .octa 0x0000000000000000ffffffffffffffff
  42. MASK2: .octa 0xffffffffffffffff0000000000000000
  43. SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  44. ALL_F: .octa 0xffffffffffffffffffffffffffffffff
  45. ZERO: .octa 0x00000000000000000000000000000000
  46. ONE: .octa 0x00000000000000000000000000000001
  47. F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  48. dec: .octa 0x1
  49. enc: .octa 0x2
  50. .text
  51. #define STACK_OFFSET 8*3
  52. #define HashKey 16*0 // store HashKey <<1 mod poly here
  53. #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
  54. #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
  55. #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
  56. #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
  57. // bits of HashKey <<1 mod poly here
  58. //(for Karatsuba purposes)
  59. #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
  60. // bits of HashKey^2 <<1 mod poly here
  61. // (for Karatsuba purposes)
  62. #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
  63. // bits of HashKey^3 <<1 mod poly here
  64. // (for Karatsuba purposes)
  65. #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
  66. // bits of HashKey^4 <<1 mod poly here
  67. // (for Karatsuba purposes)
  68. #define VARIABLE_OFFSET 16*8
  69. #define arg1 rdi
  70. #define arg2 rsi
  71. #define arg3 rdx
  72. #define arg4 rcx
  73. #define arg5 r8
  74. #define arg6 r9
  75. #define arg7 STACK_OFFSET+8(%r14)
  76. #define arg8 STACK_OFFSET+16(%r14)
  77. #define arg9 STACK_OFFSET+24(%r14)
  78. #define arg10 STACK_OFFSET+32(%r14)
  79. #endif
  80. #define STATE1 %xmm0
  81. #define STATE2 %xmm4
  82. #define STATE3 %xmm5
  83. #define STATE4 %xmm6
  84. #define STATE STATE1
  85. #define IN1 %xmm1
  86. #define IN2 %xmm7
  87. #define IN3 %xmm8
  88. #define IN4 %xmm9
  89. #define IN IN1
  90. #define KEY %xmm2
  91. #define IV %xmm3
  92. #define BSWAP_MASK %xmm10
  93. #define CTR %xmm11
  94. #define INC %xmm12
  95. #ifdef __x86_64__
  96. #define AREG %rax
  97. #define KEYP %rdi
  98. #define OUTP %rsi
  99. #define UKEYP OUTP
  100. #define INP %rdx
  101. #define LEN %rcx
  102. #define IVP %r8
  103. #define KLEN %r9d
  104. #define T1 %r10
  105. #define TKEYP T1
  106. #define T2 %r11
  107. #define TCTR_LOW T2
  108. #else
  109. #define AREG %eax
  110. #define KEYP %edi
  111. #define OUTP AREG
  112. #define UKEYP OUTP
  113. #define INP %edx
  114. #define LEN %esi
  115. #define IVP %ebp
  116. #define KLEN %ebx
  117. #define T1 %ecx
  118. #define TKEYP T1
  119. #endif
  120. #ifdef __x86_64__
  121. /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  122. *
  123. *
  124. * Input: A and B (128-bits each, bit-reflected)
  125. * Output: C = A*B*x mod poly, (i.e. >>1 )
  126. * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  127. * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  128. *
  129. */
  130. .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
  131. movdqa \GH, \TMP1
  132. pshufd $78, \GH, \TMP2
  133. pshufd $78, \HK, \TMP3
  134. pxor \GH, \TMP2 # TMP2 = a1+a0
  135. pxor \HK, \TMP3 # TMP3 = b1+b0
  136. PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
  137. PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
  138. PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
  139. pxor \GH, \TMP2
  140. pxor \TMP1, \TMP2 # TMP2 = (a1*b0)+(a0*b1), the middle Karatsuba term
  141. movdqa \TMP2, \TMP3
  142. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  143. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  144. pxor \TMP3, \GH
  145. pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
  146. # first phase of the reduction
  147. movdqa \GH, \TMP2
  148. movdqa \GH, \TMP3
  149. movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
  150. # in order to perform
  151. # independent shifts
  152. pslld $31, \TMP2 # packed left shift <<31
  153. pslld $30, \TMP3 # packed left shift <<30
  154. pslld $25, \TMP4 # packed left shift <<25
  155. pxor \TMP3, \TMP2 # xor the shifted versions
  156. pxor \TMP4, \TMP2
  157. movdqa \TMP2, \TMP5
  158. psrldq $4, \TMP5 # right shift TMP5 1 DW
  159. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  160. pxor \TMP2, \GH
  161. # second phase of the reduction
  162. movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
  163. # in order to perform
  164. # independent shifts
  165. movdqa \GH,\TMP3
  166. movdqa \GH,\TMP4
  167. psrld $1,\TMP2 # packed right shift >>1
  168. psrld $2,\TMP3 # packed right shift >>2
  169. psrld $7,\TMP4 # packed right shift >>7
  170. pxor \TMP3,\TMP2 # xor the shifted versions
  171. pxor \TMP4,\TMP2
  172. pxor \TMP5, \TMP2
  173. pxor \TMP2, \GH
  174. pxor \TMP1, \GH # result is in GH
  175. .endm
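/*
 * Illustrative sketch only (not part of the kernel build): the Karatsuba
 * carry-less multiply that GHASH_MUL performs with three PCLMULQDQ
 * instructions, written as C intrinsics.  The two-phase reduction modulo
 * x^128 + x^127 + x^126 + x^121 + 1 done above is omitted here, and the
 * helper name clmul_karatsuba is made up for this example.  Compile with
 * -mpclmul -msse2.  Note 0x4e is the same constant as the pshufd $78 above.
 *
 *   #include <wmmintrin.h>                       // _mm_clmulepi64_si128
 *   #include <emmintrin.h>                       // SSE2 xor/byte shifts
 *
 *   // 256-bit carry-less product of a and b, returned as hi:lo
 *   static void clmul_karatsuba(__m128i a, __m128i b, __m128i *hi, __m128i *lo)
 *   {
 *       __m128i a1b1 = _mm_clmulepi64_si128(a, b, 0x11);              // a1*b1
 *       __m128i a0b0 = _mm_clmulepi64_si128(a, b, 0x00);              // a0*b0
 *       __m128i asum = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e));  // a1+a0
 *       __m128i bsum = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e));  // b1+b0
 *       __m128i mid  = _mm_clmulepi64_si128(asum, bsum, 0x00); // (a1+a0)*(b1+b0)
 *       mid = _mm_xor_si128(mid, _mm_xor_si128(a0b0, a1b1));   // a1*b0 + a0*b1
 *       *lo = _mm_xor_si128(a0b0, _mm_slli_si128(mid, 8));
 *       *hi = _mm_xor_si128(a1b1, _mm_srli_si128(mid, 8));
 *   }
 */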
  176. /*
  177. * if a = number of total plaintext bytes
  178. * b = floor(a/16)
  179. * num_initial_blocks = b mod 4
  180. * encrypt the initial num_initial_blocks blocks and apply ghash on
  181. * the ciphertext
  182. * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  183. * are clobbered
  184. * arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
  185. */
  186. .macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
  187. XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
  188. mov arg7, %r10 # %r10 = AAD
  189. mov arg8, %r12 # %r12 = aadLen
  190. mov %r12, %r11
  191. pxor %xmm\i, %xmm\i
  192. _get_AAD_loop\num_initial_blocks\operation:
  193. movd (%r10), \TMP1
  194. pslldq $12, \TMP1
  195. psrldq $4, %xmm\i
  196. pxor \TMP1, %xmm\i
  197. add $4, %r10
  198. sub $4, %r12
  199. jne _get_AAD_loop\num_initial_blocks\operation
  200. cmp $16, %r11
  201. je _get_AAD_loop2_done\num_initial_blocks\operation
  202. mov $16, %r12
  203. _get_AAD_loop2\num_initial_blocks\operation:
  204. psrldq $4, %xmm\i
  205. sub $4, %r12
  206. cmp %r11, %r12
  207. jne _get_AAD_loop2\num_initial_blocks\operation
  208. _get_AAD_loop2_done\num_initial_blocks\operation:
  209. pshufb SHUF_MASK(%rip), %xmm\i # byte-reflect the AAD data
  210. xor %r11, %r11 # initialise the data pointer offset as zero
  211. # start AES for num_initial_blocks blocks
  212. mov %arg5, %rax # %rax = *Y0
  213. movdqu (%rax), \XMM0 # XMM0 = Y0
  214. pshufb SHUF_MASK(%rip), \XMM0
  215. .if \i_seq != 0
  216. .irpc index, \i_seq
  217. paddd ONE(%rip), \XMM0 # INCR Y0
  218. movdqa \XMM0, %xmm\index
  219. pshufb SHUF_MASK(%rip), %xmm\index # perform a 16 byte swap
  220. .endr
  221. .irpc index, \i_seq
  222. pxor 16*0(%arg1), %xmm\index
  223. .endr
  224. .irpc index, \i_seq
  225. movaps 0x10(%rdi), \TMP1
  226. AESENC \TMP1, %xmm\index # Round 1
  227. .endr
  228. .irpc index, \i_seq
  229. movaps 0x20(%arg1), \TMP1
  230. AESENC \TMP1, %xmm\index # Round 2
  231. .endr
  232. .irpc index, \i_seq
  233. movaps 0x30(%arg1), \TMP1
  234. AESENC \TMP1, %xmm\index # Round 3
  235. .endr
  236. .irpc index, \i_seq
  237. movaps 0x40(%arg1), \TMP1
  238. AESENC \TMP1, %xmm\index # Round 4
  239. .endr
  240. .irpc index, \i_seq
  241. movaps 0x50(%arg1), \TMP1
  242. AESENC \TMP1, %xmm\index # Round 5
  243. .endr
  244. .irpc index, \i_seq
  245. movaps 0x60(%arg1), \TMP1
  246. AESENC \TMP1, %xmm\index # Round 6
  247. .endr
  248. .irpc index, \i_seq
  249. movaps 0x70(%arg1), \TMP1
  250. AESENC \TMP1, %xmm\index # Round 7
  251. .endr
  252. .irpc index, \i_seq
  253. movaps 0x80(%arg1), \TMP1
  254. AESENC \TMP1, %xmm\index # Round 8
  255. .endr
  256. .irpc index, \i_seq
  257. movaps 0x90(%arg1), \TMP1
  258. AESENC \TMP1, %xmm\index # Round 9
  259. .endr
  260. .irpc index, \i_seq
  261. movaps 0xa0(%arg1), \TMP1
  262. AESENCLAST \TMP1, %xmm\index # Round 10
  263. .endr
  264. .irpc index, \i_seq
  265. movdqu (%arg3 , %r11, 1), \TMP1
  266. pxor \TMP1, %xmm\index
  267. movdqu %xmm\index, (%arg2 , %r11, 1)
  268. # write back plaintext/ciphertext for num_initial_blocks
  269. add $16, %r11
  270. .if \operation == dec
  271. movdqa \TMP1, %xmm\index
  272. .endif
  273. pshufb SHUF_MASK(%rip), %xmm\index
  274. # prepare plaintext/ciphertext for GHASH computation
  275. .endr
  276. .endif
  277. GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  278. # apply GHASH on num_initial_blocks blocks
  279. .if \i == 5
  280. pxor %xmm5, %xmm6
  281. GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  282. pxor %xmm6, %xmm7
  283. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  284. pxor %xmm7, %xmm8
  285. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  286. .elseif \i == 6
  287. pxor %xmm6, %xmm7
  288. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  289. pxor %xmm7, %xmm8
  290. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  291. .elseif \i == 7
  292. pxor %xmm7, %xmm8
  293. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  294. .endif
  295. cmp $64, %r13
  296. jl _initial_blocks_done\num_initial_blocks\operation
  297. # no need for precomputed values
  298. /*
  299. *
  300. * Precomputations for HashKey parallel with encryption of first 4 blocks.
  301. * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
  302. */
  303. paddd ONE(%rip), \XMM0 # INCR Y0
  304. movdqa \XMM0, \XMM1
  305. pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
  306. paddd ONE(%rip), \XMM0 # INCR Y0
  307. movdqa \XMM0, \XMM2
  308. pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
  309. paddd ONE(%rip), \XMM0 # INCR Y0
  310. movdqa \XMM0, \XMM3
  311. pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
  312. paddd ONE(%rip), \XMM0 # INCR Y0
  313. movdqa \XMM0, \XMM4
  314. pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
  315. pxor 16*0(%arg1), \XMM1
  316. pxor 16*0(%arg1), \XMM2
  317. pxor 16*0(%arg1), \XMM3
  318. pxor 16*0(%arg1), \XMM4
  319. movdqa \TMP3, \TMP5
  320. pshufd $78, \TMP3, \TMP1
  321. pxor \TMP3, \TMP1
  322. movdqa \TMP1, HashKey_k(%rsp)
  323. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  324. # TMP5 = HashKey^2<<1 (mod poly)
  325. movdqa \TMP5, HashKey_2(%rsp)
  326. # HashKey_2 = HashKey^2<<1 (mod poly)
  327. pshufd $78, \TMP5, \TMP1
  328. pxor \TMP5, \TMP1
  329. movdqa \TMP1, HashKey_2_k(%rsp)
  330. .irpc index, 1234 # do 4 rounds
  331. movaps 0x10*\index(%arg1), \TMP1
  332. AESENC \TMP1, \XMM1
  333. AESENC \TMP1, \XMM2
  334. AESENC \TMP1, \XMM3
  335. AESENC \TMP1, \XMM4
  336. .endr
  337. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  338. # TMP5 = HashKey^3<<1 (mod poly)
  339. movdqa \TMP5, HashKey_3(%rsp)
  340. pshufd $78, \TMP5, \TMP1
  341. pxor \TMP5, \TMP1
  342. movdqa \TMP1, HashKey_3_k(%rsp)
  343. .irpc index, 56789 # do next 5 rounds
  344. movaps 0x10*\index(%arg1), \TMP1
  345. AESENC \TMP1, \XMM1
  346. AESENC \TMP1, \XMM2
  347. AESENC \TMP1, \XMM3
  348. AESENC \TMP1, \XMM4
  349. .endr
  350. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  351. # TMP5 = HashKey^4<<1 (mod poly)
  352. movdqa \TMP5, HashKey_4(%rsp)
  353. pshufd $78, \TMP5, \TMP1
  354. pxor \TMP5, \TMP1
  355. movdqa \TMP1, HashKey_4_k(%rsp)
  356. movaps 0xa0(%arg1), \TMP2
  357. AESENCLAST \TMP2, \XMM1
  358. AESENCLAST \TMP2, \XMM2
  359. AESENCLAST \TMP2, \XMM3
  360. AESENCLAST \TMP2, \XMM4
  361. movdqu 16*0(%arg3 , %r11 , 1), \TMP1
  362. pxor \TMP1, \XMM1
  363. .if \operation == dec
  364. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  365. movdqa \TMP1, \XMM1
  366. .endif
  367. movdqu 16*1(%arg3 , %r11 , 1), \TMP1
  368. pxor \TMP1, \XMM2
  369. .if \operation == dec
  370. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  371. movdqa \TMP1, \XMM2
  372. .endif
  373. movdqu 16*2(%arg3 , %r11 , 1), \TMP1
  374. pxor \TMP1, \XMM3
  375. .if \operation == dec
  376. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  377. movdqa \TMP1, \XMM3
  378. .endif
  379. movdqu 16*3(%arg3 , %r11 , 1), \TMP1
  380. pxor \TMP1, \XMM4
  381. .if \operation == dec
  382. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  383. movdqa \TMP1, \XMM4
  384. .else
  385. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  386. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  387. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  388. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  389. .endif
  390. add $64, %r11
  391. pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
  392. pxor \XMMDst, \XMM1
  393. # combine GHASHed value with the corresponding ciphertext
  394. pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
  395. pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
  396. pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
  397. _initial_blocks_done\num_initial_blocks\operation:
  398. .endm
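/*
 * Illustrative sketch only: how the callers of INITIAL_BLOCKS below pick the
 * number of "initial" blocks, following the rule in the macro header
 * (num_initial_blocks = floor(len/16) mod 4), so that the remaining whole
 * blocks can be processed four at a time.  The function name is made up for
 * this example.
 *
 *   #include <stdint.h>
 *
 *   static unsigned int initial_block_count(uint64_t plaintext_len)
 *   {
 *       uint64_t full_blocks = plaintext_len / 16;   // b = floor(a/16)
 *       return (unsigned int)(full_blocks % 4);      // num_initial_blocks = b mod 4
 *   }
 */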
  399. /*
  400. * encrypt 4 blocks at a time
  401. * ghash the 4 previously encrypted ciphertext blocks
  402. * arg1, %arg2, %arg3 are used as pointers only, not modified
  403. * %r11 is the data offset value
  404. */
  405. .macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \
  406. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  407. movdqa \XMM1, \XMM5
  408. movdqa \XMM2, \XMM6
  409. movdqa \XMM3, \XMM7
  410. movdqa \XMM4, \XMM8
  411. # multiply TMP5 * HashKey using karatsuba
  412. movdqa \XMM5, \TMP4
  413. pshufd $78, \XMM5, \TMP6
  414. pxor \XMM5, \TMP6
  415. paddd ONE(%rip), \XMM0 # INCR CNT
  416. movdqa HashKey_4(%rsp), \TMP5
  417. PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  418. movdqa \XMM0, \XMM1
  419. paddd ONE(%rip), \XMM0 # INCR CNT
  420. movdqa \XMM0, \XMM2
  421. paddd ONE(%rip), \XMM0 # INCR CNT
  422. movdqa \XMM0, \XMM3
  423. paddd ONE(%rip), \XMM0 # INCR CNT
  424. movdqa \XMM0, \XMM4
  425. pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
  426. PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  427. pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
  428. pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
  429. pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
  430. pxor (%arg1), \XMM1
  431. pxor (%arg1), \XMM2
  432. pxor (%arg1), \XMM3
  433. pxor (%arg1), \XMM4
  434. movdqa HashKey_4_k(%rsp), \TMP5
  435. PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  436. movaps 0x10(%arg1), \TMP1
  437. AESENC \TMP1, \XMM1 # Round 1
  438. AESENC \TMP1, \XMM2
  439. AESENC \TMP1, \XMM3
  440. AESENC \TMP1, \XMM4
  441. movaps 0x20(%arg1), \TMP1
  442. AESENC \TMP1, \XMM1 # Round 2
  443. AESENC \TMP1, \XMM2
  444. AESENC \TMP1, \XMM3
  445. AESENC \TMP1, \XMM4
  446. movdqa \XMM6, \TMP1
  447. pshufd $78, \XMM6, \TMP2
  448. pxor \XMM6, \TMP2
  449. movdqa HashKey_3(%rsp), \TMP5
  450. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  451. movaps 0x30(%arg1), \TMP3
  452. AESENC \TMP3, \XMM1 # Round 3
  453. AESENC \TMP3, \XMM2
  454. AESENC \TMP3, \XMM3
  455. AESENC \TMP3, \XMM4
  456. PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  457. movaps 0x40(%arg1), \TMP3
  458. AESENC \TMP3, \XMM1 # Round 4
  459. AESENC \TMP3, \XMM2
  460. AESENC \TMP3, \XMM3
  461. AESENC \TMP3, \XMM4
  462. movdqa HashKey_3_k(%rsp), \TMP5
  463. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  464. movaps 0x50(%arg1), \TMP3
  465. AESENC \TMP3, \XMM1 # Round 5
  466. AESENC \TMP3, \XMM2
  467. AESENC \TMP3, \XMM3
  468. AESENC \TMP3, \XMM4
  469. pxor \TMP1, \TMP4
  470. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  471. pxor \XMM6, \XMM5
  472. pxor \TMP2, \TMP6
  473. movdqa \XMM7, \TMP1
  474. pshufd $78, \XMM7, \TMP2
  475. pxor \XMM7, \TMP2
  476. movdqa HashKey_2(%rsp ), \TMP5
  477. # Multiply TMP5 * HashKey using karatsuba
  478. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  479. movaps 0x60(%arg1), \TMP3
  480. AESENC \TMP3, \XMM1 # Round 6
  481. AESENC \TMP3, \XMM2
  482. AESENC \TMP3, \XMM3
  483. AESENC \TMP3, \XMM4
  484. PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  485. movaps 0x70(%arg1), \TMP3
  486. AESENC \TMP3, \XMM1 # Round 7
  487. AESENC \TMP3, \XMM2
  488. AESENC \TMP3, \XMM3
  489. AESENC \TMP3, \XMM4
  490. movdqa HashKey_2_k(%rsp), \TMP5
  491. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  492. movaps 0x80(%arg1), \TMP3
  493. AESENC \TMP3, \XMM1 # Round 8
  494. AESENC \TMP3, \XMM2
  495. AESENC \TMP3, \XMM3
  496. AESENC \TMP3, \XMM4
  497. pxor \TMP1, \TMP4
  498. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  499. pxor \XMM7, \XMM5
  500. pxor \TMP2, \TMP6
  501. # Multiply XMM8 * HashKey
  502. # XMM8 and TMP5 hold the values for the two operands
  503. movdqa \XMM8, \TMP1
  504. pshufd $78, \XMM8, \TMP2
  505. pxor \XMM8, \TMP2
  506. movdqa HashKey(%rsp), \TMP5
  507. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  508. movaps 0x90(%arg1), \TMP3
  509. AESENC \TMP3, \XMM1 # Round 9
  510. AESENC \TMP3, \XMM2
  511. AESENC \TMP3, \XMM3
  512. AESENC \TMP3, \XMM4
  513. PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  514. movaps 0xa0(%arg1), \TMP3
  515. AESENCLAST \TMP3, \XMM1 # Round 10
  516. AESENCLAST \TMP3, \XMM2
  517. AESENCLAST \TMP3, \XMM3
  518. AESENCLAST \TMP3, \XMM4
  519. movdqa HashKey_k(%rsp), \TMP5
  520. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  521. movdqu (%arg3,%r11,1), \TMP3
  522. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  523. .if \operation == dec
  524. movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
  525. movdqa \TMP3, \XMM1
  526. .endif
  527. movdqu 16(%arg3,%r11,1), \TMP3
  528. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  529. .if \operation == dec
  530. movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
  531. movdqa \TMP3, \XMM2
  532. .endif
  533. movdqu 32(%arg3,%r11,1), \TMP3
  534. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  535. .if \operation == dec
  536. movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
  537. movdqa \TMP3, \XMM3
  538. .endif
  539. movdqu 48(%arg3,%r11,1), \TMP3
  540. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  541. .if \operation == dec
  542. movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
  543. movdqa \TMP3, \XMM4
  544. .else
  545. movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
  546. movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
  547. movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
  548. movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
  549. .endif
  550. pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
  551. pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
  552. pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
  553. pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
  554. pxor \TMP4, \TMP1
  555. pxor \XMM8, \XMM5
  556. pxor \TMP6, \TMP2
  557. pxor \TMP1, \TMP2
  558. pxor \XMM5, \TMP2
  559. movdqa \TMP2, \TMP3
  560. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  561. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  562. pxor \TMP3, \XMM5
  563. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  564. # first phase of reduction
  565. movdqa \XMM5, \TMP2
  566. movdqa \XMM5, \TMP3
  567. movdqa \XMM5, \TMP4
  568. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
  569. pslld $31, \TMP2 # packed left shift << 31
  570. pslld $30, \TMP3 # packed left shift << 30
  571. pslld $25, \TMP4 # packed left shift << 25
  572. pxor \TMP3, \TMP2 # xor the shifted versions
  573. pxor \TMP4, \TMP2
  574. movdqa \TMP2, \TMP5
  575. psrldq $4, \TMP5 # right shift T5 1 DW
  576. pslldq $12, \TMP2 # left shift T2 3 DWs
  577. pxor \TMP2, \XMM5
  578. # second phase of reduction
  579. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  580. movdqa \XMM5,\TMP3
  581. movdqa \XMM5,\TMP4
  582. psrld $1, \TMP2 # packed right shift >>1
  583. psrld $2, \TMP3 # packed right shift >>2
  584. psrld $7, \TMP4 # packed right shift >>7
  585. pxor \TMP3,\TMP2 # xor the shifted versions
  586. pxor \TMP4,\TMP2
  587. pxor \TMP5, \TMP2
  588. pxor \TMP2, \XMM5
  589. pxor \TMP1, \XMM5 # result is in XMM5
  590. pxor \XMM5, \XMM1
  591. .endm
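/*
 * Illustrative sketch only: the 4-way hashing above and in GHASH_LAST_4
 * relies on the precomputed powers HashKey^4..HashKey, so four blocks fold
 * into the GHASH state with independent multiplications:
 *
 *   Y_new = (Y ^ C1)*H^4  ^  C2*H^3  ^  C3*H^2  ^  C4*H
 *
 * which gives the same result as folding the four blocks in one at a time.
 * C sketch; the u128_t type and the gf128_mul helper are placeholders for
 * the GHASH_MUL reduction above, not real kernel APIs.
 *
 *   #include <stdint.h>
 *
 *   typedef struct { uint64_t hi, lo; } u128_t;
 *
 *   u128_t gf128_mul(u128_t a, u128_t b);        // placeholder, not shown
 *
 *   static u128_t xor128(u128_t a, u128_t b)
 *   {
 *       return (u128_t){ a.hi ^ b.hi, a.lo ^ b.lo };
 *   }
 *
 *   static u128_t ghash_4blocks(u128_t y, const u128_t c[4],
 *                               const u128_t hpow[4])  // H^4, H^3, H^2, H
 *   {
 *       u128_t acc = gf128_mul(xor128(y, c[0]), hpow[0]);
 *       acc = xor128(acc, gf128_mul(c[1], hpow[1]));
 *       acc = xor128(acc, gf128_mul(c[2], hpow[2]));
 *       acc = xor128(acc, gf128_mul(c[3], hpow[3]));
 *       return acc;
 *   }
 */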
  592. /* GHASH the last 4 ciphertext blocks. */
  593. .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
  594. TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
  595. # Multiply TMP6 * HashKey (using Karatsuba)
  596. movdqa \XMM1, \TMP6
  597. pshufd $78, \XMM1, \TMP2
  598. pxor \XMM1, \TMP2
  599. movdqa HashKey_4(%rsp), \TMP5
  600. PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
  601. PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
  602. movdqa HashKey_4_k(%rsp), \TMP4
  603. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  604. movdqa \XMM1, \XMMDst
  605. movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
  606. # Multiply TMP1 * HashKey (using Karatsuba)
  607. movdqa \XMM2, \TMP1
  608. pshufd $78, \XMM2, \TMP2
  609. pxor \XMM2, \TMP2
  610. movdqa HashKey_3(%rsp), \TMP5
  611. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  612. PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
  613. movdqa HashKey_3_k(%rsp), \TMP4
  614. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  615. pxor \TMP1, \TMP6
  616. pxor \XMM2, \XMMDst
  617. pxor \TMP2, \XMM1
  618. # results accumulated in TMP6, XMMDst, XMM1
  619. # Multiply TMP1 * HashKey (using Karatsuba)
  620. movdqa \XMM3, \TMP1
  621. pshufd $78, \XMM3, \TMP2
  622. pxor \XMM3, \TMP2
  623. movdqa HashKey_2(%rsp), \TMP5
  624. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  625. PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
  626. movdqa HashKey_2_k(%rsp), \TMP4
  627. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  628. pxor \TMP1, \TMP6
  629. pxor \XMM3, \XMMDst
  630. pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
  631. # Multiply TMP1 * HashKey (using Karatsuba)
  632. movdqa \XMM4, \TMP1
  633. pshufd $78, \XMM4, \TMP2
  634. pxor \XMM4, \TMP2
  635. movdqa HashKey(%rsp), \TMP5
  636. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  637. PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
  638. movdqa HashKey_k(%rsp), \TMP4
  639. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  640. pxor \TMP1, \TMP6
  641. pxor \XMM4, \XMMDst
  642. pxor \XMM1, \TMP2
  643. pxor \TMP6, \TMP2
  644. pxor \XMMDst, \TMP2
  645. # middle section of the temp results combined as in karatsuba algorithm
  646. movdqa \TMP2, \TMP4
  647. pslldq $8, \TMP4 # left shift TMP4 2 DWs
  648. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  649. pxor \TMP4, \XMMDst
  650. pxor \TMP2, \TMP6
  651. # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
  652. # first phase of the reduction
  653. movdqa \XMMDst, \TMP2
  654. movdqa \XMMDst, \TMP3
  655. movdqa \XMMDst, \TMP4
  656. # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
  657. pslld $31, \TMP2 # packed left shifting << 31
  658. pslld $30, \TMP3 # packed left shifting << 30
  659. pslld $25, \TMP4 # packed left shifting << 25
  660. pxor \TMP3, \TMP2 # xor the shifted versions
  661. pxor \TMP4, \TMP2
  662. movdqa \TMP2, \TMP7
  663. psrldq $4, \TMP7 # right shift TMP7 1 DW
  664. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  665. pxor \TMP2, \XMMDst
  666. # second phase of the reduction
  667. movdqa \XMMDst, \TMP2
  668. # make 3 copies of XMMDst for doing 3 shift operations
  669. movdqa \XMMDst, \TMP3
  670. movdqa \XMMDst, \TMP4
  671. psrld $1, \TMP2 # packed right shift >> 1
  672. psrld $2, \TMP3 # packed right shift >> 2
  673. psrld $7, \TMP4 # packed right shift >> 7
  674. pxor \TMP3, \TMP2 # xor the shifted versions
  675. pxor \TMP4, \TMP2
  676. pxor \TMP7, \TMP2
  677. pxor \TMP2, \XMMDst
  678. pxor \TMP6, \XMMDst # reduced result is in XMMDst
  679. .endm
  680. /* Encrypt a single block */
  681. .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
  682. pxor (%arg1), \XMM0
  683. movaps 16(%arg1), \TMP1
  684. AESENC \TMP1, \XMM0
  685. movaps 32(%arg1), \TMP1
  686. AESENC \TMP1, \XMM0
  687. movaps 48(%arg1), \TMP1
  688. AESENC \TMP1, \XMM0
  689. movaps 64(%arg1), \TMP1
  690. AESENC \TMP1, \XMM0
  691. movaps 80(%arg1), \TMP1
  692. AESENC \TMP1, \XMM0
  693. movaps 96(%arg1), \TMP1
  694. AESENC \TMP1, \XMM0
  695. movaps 112(%arg1), \TMP1
  696. AESENC \TMP1, \XMM0
  697. movaps 128(%arg1), \TMP1
  698. AESENC \TMP1, \XMM0
  699. movaps 144(%arg1), \TMP1
  700. AESENC \TMP1, \XMM0
  701. movaps 160(%arg1), \TMP1
  702. AESENCLAST \TMP1, \XMM0
  703. .endm
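/*
 * Illustrative sketch only: the same 10-round AES-128 block encryption
 * expressed with C intrinsics (compile with -maes).  rk[] stands for the
 * expanded key schedule that %arg1 points at; rk[0] is the whitening key.
 *
 *   #include <wmmintrin.h>        // _mm_aesenc_si128, _mm_aesenclast_si128
 *
 *   static __m128i aes128_encrypt_block(__m128i block, const __m128i rk[11])
 *   {
 *       block = _mm_xor_si128(block, rk[0]);         // round 0 (whitening)
 *       for (int i = 1; i < 10; i++)
 *           block = _mm_aesenc_si128(block, rk[i]);  // rounds 1-9
 *       return _mm_aesenclast_si128(block, rk[10]);  // round 10
 *   }
 */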
  704. /*****************************************************************************
  705. * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  706. * u8 *out, // Plaintext output. Encrypt in-place is allowed.
  707. * const u8 *in, // Ciphertext input
  708. * u64 plaintext_len, // Length of data in bytes for decryption.
  709. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  710. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  711. * // concatenated with 0x00000001. 16-byte aligned pointer.
  712. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  713. * const u8 *aad, // Additional Authentication Data (AAD)
  714. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  715. * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
  716. * // given authentication tag and only return the plaintext if they match.
  717. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
  718. * // (most likely), 12 or 8.
  719. *
  720. * Assumptions:
  721. *
  722. * keys:
  723. * keys are pre-expanded and aligned to 16 bytes. we are using the first
  724. * set of 11 keys in the data structure void *aes_ctx
  725. *
  726. * iv:
  727. * 0 1 2 3
  728. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  729. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  730. * | Salt (From the SA) |
  731. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  732. * | Initialization Vector |
  733. * | (This is the sequence number from IPSec header) |
  734. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  735. * | 0x1 |
  736. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  737. *
  738. *
  739. *
  740. * AAD:
  741. * AAD padded to 128 bits with 0
  742. * for example, assume AAD is a u32 vector
  743. *
  744. * if AAD is 8 bytes:
  745. * AAD[3] = {A0, A1};
  746. * padded AAD in xmm register = {A1 A0 0 0}
  747. *
  748. * 0 1 2 3
  749. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  750. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  751. * | SPI (A1) |
  752. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  753. * | 32-bit Sequence Number (A0) |
  754. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  755. * | 0x0 |
  756. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  757. *
  758. * AAD Format with 32-bit Sequence Number
  759. *
  760. * if AAD is 12 bytes:
  761. * AAD[3] = {A0, A1, A2};
  762. * padded AAD in xmm register = {A2 A1 A0 0}
  763. *
  764. * 0 1 2 3
  765. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  766. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  767. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  768. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  769. * | SPI (A2) |
  770. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  771. * | 64-bit Extended Sequence Number {A1,A0} |
  772. * | |
  773. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  774. * | 0x0 |
  775. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  776. *
  777. * AAD Format with 64-bit Extended Sequence Number
  778. *
  779. * aadLen:
  780. * from the definition of the spec, aadLen can only be 8 or 12 bytes.
  781. * The code supports 16 too but for other sizes, the code will fail.
  782. *
  783. * TLen:
  784. * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
  785. * For other sizes, the code will fail.
  786. *
  787. * poly = x^128 + x^127 + x^126 + x^121 + 1
  788. *
  789. *****************************************************************************/
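/*
 * Illustrative sketch only: how a caller could assemble the 16-byte
 * pre-counter block j0 described above (4-byte salt from the SA, the 8-byte
 * IV from the ESP payload, then the 32-bit value 0x00000001).  Function and
 * parameter names are made up; the real callers live in the glue code, not
 * in this file.
 *
 *   #include <stdint.h>
 *   #include <string.h>
 *
 *   static void build_j0(uint8_t j0[16], const uint8_t salt[4],
 *                        const uint8_t esp_iv[8])
 *   {
 *       memcpy(j0, salt, 4);            // salt (from the SA)
 *       memcpy(j0 + 4, esp_iv, 8);      // IV (sequence number from IPSec header)
 *       j0[12] = 0x00;                  // trailing 0x00000001
 *       j0[13] = 0x00;
 *       j0[14] = 0x00;
 *       j0[15] = 0x01;
 *   }
 */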
  790. ENTRY(aesni_gcm_dec)
  791. push %r12
  792. push %r13
  793. push %r14
  794. mov %rsp, %r14
  795. /*
  796. * states of %xmm registers %xmm6:%xmm15 not saved
  797. * all %xmm registers are clobbered
  798. */
  799. sub $VARIABLE_OFFSET, %rsp
  800. and $~63, %rsp # align rsp to 64 bytes
  801. mov %arg6, %r12
  802. movdqu (%r12), %xmm13 # %xmm13 = HashKey
  803. pshufb SHUF_MASK(%rip), %xmm13
  804. # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
  805. movdqa %xmm13, %xmm2
  806. psllq $1, %xmm13
  807. psrlq $63, %xmm2
  808. movdqa %xmm2, %xmm1
  809. pslldq $8, %xmm2
  810. psrldq $8, %xmm1
  811. por %xmm2, %xmm13
  812. # Reduction
  813. pshufd $0x24, %xmm1, %xmm2
  814. pcmpeqd TWOONE(%rip), %xmm2
  815. pand POLY(%rip), %xmm2
  816. pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
  817. # Decrypt first few blocks
  818. movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
  819. mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
  820. and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
  821. mov %r13, %r12
  822. and $(3<<4), %r12
  823. jz _initial_num_blocks_is_0_decrypt
  824. cmp $(2<<4), %r12
  825. jb _initial_num_blocks_is_1_decrypt
  826. je _initial_num_blocks_is_2_decrypt
  827. _initial_num_blocks_is_3_decrypt:
  828. INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  829. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
  830. sub $48, %r13
  831. jmp _initial_blocks_decrypted
  832. _initial_num_blocks_is_2_decrypt:
  833. INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  834. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
  835. sub $32, %r13
  836. jmp _initial_blocks_decrypted
  837. _initial_num_blocks_is_1_decrypt:
  838. INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  839. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
  840. sub $16, %r13
  841. jmp _initial_blocks_decrypted
  842. _initial_num_blocks_is_0_decrypt:
  843. INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  844. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
  845. _initial_blocks_decrypted:
  846. cmp $0, %r13
  847. je _zero_cipher_left_decrypt
  848. sub $64, %r13
  849. je _four_cipher_left_decrypt
  850. _decrypt_by_4:
  851. GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  852. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
  853. add $64, %r11
  854. sub $64, %r13
  855. jne _decrypt_by_4
  856. _four_cipher_left_decrypt:
  857. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  858. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  859. _zero_cipher_left_decrypt:
  860. mov %arg4, %r13
  861. and $15, %r13 # %r13 = arg4 (mod 16)
  862. je _multiple_of_16_bytes_decrypt
  863. # Handle the last <16 byte block separately
  864. paddd ONE(%rip), %xmm0 # increment CNT to get Yn
  865. pshufb SHUF_MASK(%rip), %xmm0
  866. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
  867. sub $16, %r11
  868. add %r13, %r11
  869. movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
  870. lea SHIFT_MASK+16(%rip), %r12
  871. sub %r13, %r12
  872. # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
  873. # (%r13 is the number of bytes in plaintext mod 16)
  874. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
  875. pshufb %xmm2, %xmm1 # right shift 16-%r13 bytes
  876. movdqa %xmm1, %xmm2
  877. pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
  878. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  879. # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
  880. pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
  881. pand %xmm1, %xmm2
  882. pshufb SHUF_MASK(%rip),%xmm2
  883. pxor %xmm2, %xmm8
  884. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  885. # GHASH computation for the last <16 byte block
  886. sub %r13, %r11
  887. add $16, %r11
  888. # output %r13 bytes
  889. movq %xmm0, %rax
  890. cmp $8, %r13
  891. jle _less_than_8_bytes_left_decrypt
  892. mov %rax, (%arg2 , %r11, 1)
  893. add $8, %r11
  894. psrldq $8, %xmm0
  895. movq %xmm0, %rax
  896. sub $8, %r13
  897. _less_than_8_bytes_left_decrypt:
  898. mov %al, (%arg2, %r11, 1)
  899. add $1, %r11
  900. shr $8, %rax
  901. sub $1, %r13
  902. jne _less_than_8_bytes_left_decrypt
  903. _multiple_of_16_bytes_decrypt:
  904. mov arg8, %r12 # %r12 = aadLen (number of bytes)
  905. shl $3, %r12 # convert into number of bits
  906. movd %r12d, %xmm15 # len(A) in %xmm15
  907. shl $3, %arg4 # len(C) in bits (*8)
  908. movq %arg4, %xmm1
  909. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  910. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  911. pxor %xmm15, %xmm8
  912. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  913. # final GHASH computation
  914. pshufb SHUF_MASK(%rip), %xmm8
  915. mov %arg5, %rax # %rax = *Y0
  916. movdqu (%rax), %xmm0 # %xmm0 = Y0
  917. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
  918. pxor %xmm8, %xmm0
  919. _return_T_decrypt:
  920. mov arg9, %r10 # %r10 = authTag
  921. mov arg10, %r11 # %r11 = auth_tag_len
  922. cmp $16, %r11
  923. je _T_16_decrypt
  924. cmp $12, %r11
  925. je _T_12_decrypt
  926. _T_8_decrypt:
  927. movq %xmm0, %rax
  928. mov %rax, (%r10)
  929. jmp _return_T_done_decrypt
  930. _T_12_decrypt:
  931. movq %xmm0, %rax
  932. mov %rax, (%r10)
  933. psrldq $8, %xmm0
  934. movd %xmm0, %eax
  935. mov %eax, 8(%r10)
  936. jmp _return_T_done_decrypt
  937. _T_16_decrypt:
  938. movdqu %xmm0, (%r10)
  939. _return_T_done_decrypt:
  940. mov %r14, %rsp
  941. pop %r14
  942. pop %r13
  943. pop %r12
  944. ret
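/*
 * Illustrative sketch only: per the header comment above, the caller
 * compares the tag written to auth_tag with the tag received in the message
 * and releases the plaintext only on a match.  Names are made up; real code
 * should use a constant-time comparison rather than memcmp.
 *
 *   #include <stdint.h>
 *   #include <string.h>
 *
 *   static int auth_tag_matches(const uint8_t *computed,
 *                               const uint8_t *expected,
 *                               uint64_t auth_tag_len)   // 16, 12 or 8
 *   {
 *       return memcmp(computed, expected, auth_tag_len) == 0;
 *   }
 */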
  945. /*****************************************************************************
  946. * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  947. * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
  948. * const u8 *in, // Plaintext input
  949. * u64 plaintext_len, // Length of data in bytes for encryption.
  950. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  951. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  952. * // concatenated with 0x00000001. 16-byte aligned pointer.
  953. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  954. * const u8 *aad, // Additional Authentication Data (AAD)
  955. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  956. * u8 *auth_tag, // Authenticated Tag output.
  957. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
  958. * // 12 or 8.
  959. *
  960. * Assumptions:
  961. *
  962. * keys:
  963. * keys are pre-expanded and aligned to 16 bytes. we are using the
  964. * first set of 11 keys in the data structure void *aes_ctx
  965. *
  966. *
  967. * iv:
  968. * 0 1 2 3
  969. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  970. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  971. * | Salt (From the SA) |
  972. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  973. * | Initialization Vector |
  974. * | (This is the sequence number from IPSec header) |
  975. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  976. * | 0x1 |
  977. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  978. *
  979. *
  980. *
  981. * AAD:
  982. * AAD padded to 128 bits with 0
  983. * for example, assume AAD is a u32 vector
  984. *
  985. * if AAD is 8 bytes:
  986. * AAD[3] = {A0, A1};
  987. * padded AAD in xmm register = {A1 A0 0 0}
  988. *
  989. * 0 1 2 3
  990. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  991. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  992. * | SPI (A1) |
  993. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  994. * | 32-bit Sequence Number (A0) |
  995. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  996. * | 0x0 |
  997. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  998. *
  999. * AAD Format with 32-bit Sequence Number
  1000. *
  1001. * if AAD is 12 bytes:
  1002. * AAD[3] = {A0, A1, A2};
  1003. * padded AAD in xmm register = {A2 A1 A0 0}
  1004. *
  1005. * 0 1 2 3
  1006. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1007. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1008. * | SPI (A2) |
  1009. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1010. * | 64-bit Extended Sequence Number {A1,A0} |
  1011. * | |
  1012. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1013. * | 0x0 |
  1014. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1015. *
  1016. * AAD Format with 64-bit Extended Sequence Number
  1017. *
  1018. * aadLen:
  1019. * from the definition of the spec, aadLen can only be 8 or 12 bytes.
  1020. * The code supports 16 too but for other sizes, the code will fail.
  1021. *
  1022. * TLen:
  1023. * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
  1024. * For other sizes, the code will fail.
  1025. *
  1026. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1027. ***************************************************************************/
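/*
 * Illustrative sketch only: the AAD described above (8 or 12 bytes) is
 * padded with zeroes to a full 16-byte block before it is fed to GHASH.
 * Names are made up for this example.
 *
 *   #include <stdint.h>
 *   #include <string.h>
 *
 *   static void pad_aad_block(uint8_t padded[16], const uint8_t *aad,
 *                             uint64_t aad_len)          // 8 or 12
 *   {
 *       memset(padded, 0, 16);
 *       memcpy(padded, aad, aad_len);
 *   }
 */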
  1028. ENTRY(aesni_gcm_enc)
  1029. push %r12
  1030. push %r13
  1031. push %r14
  1032. mov %rsp, %r14
  1033. #
  1034. # states of %xmm registers %xmm6:%xmm15 not saved
  1035. # all %xmm registers are clobbered
  1036. #
  1037. sub $VARIABLE_OFFSET, %rsp
  1038. and $~63, %rsp
  1039. mov %arg6, %r12
  1040. movdqu (%r12), %xmm13
  1041. pshufb SHUF_MASK(%rip), %xmm13
  1042. # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
  1043. movdqa %xmm13, %xmm2
  1044. psllq $1, %xmm13
  1045. psrlq $63, %xmm2
  1046. movdqa %xmm2, %xmm1
  1047. pslldq $8, %xmm2
  1048. psrldq $8, %xmm1
  1049. por %xmm2, %xmm13
  1050. # reduce HashKey<<1
  1051. pshufd $0x24, %xmm1, %xmm2
  1052. pcmpeqd TWOONE(%rip), %xmm2
  1053. pand POLY(%rip), %xmm2
  1054. pxor %xmm2, %xmm13
  1055. movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
  1056. mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
  1057. and $-16, %r13
  1058. mov %r13, %r12
  1059. # Encrypt first few blocks
  1060. and $(3<<4), %r12
  1061. jz _initial_num_blocks_is_0_encrypt
  1062. cmp $(2<<4), %r12
  1063. jb _initial_num_blocks_is_1_encrypt
  1064. je _initial_num_blocks_is_2_encrypt
  1065. _initial_num_blocks_is_3_encrypt:
  1066. INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1067. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
  1068. sub $48, %r13
  1069. jmp _initial_blocks_encrypted
  1070. _initial_num_blocks_is_2_encrypt:
  1071. INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1072. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
  1073. sub $32, %r13
  1074. jmp _initial_blocks_encrypted
  1075. _initial_num_blocks_is_1_encrypt:
  1076. INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1077. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
  1078. sub $16, %r13
  1079. jmp _initial_blocks_encrypted
  1080. _initial_num_blocks_is_0_encrypt:
  1081. INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1082. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
  1083. _initial_blocks_encrypted:
  1084. # Main loop - Encrypt remaining blocks
  1085. cmp $0, %r13
  1086. je _zero_cipher_left_encrypt
  1087. sub $64, %r13
  1088. je _four_cipher_left_encrypt
  1089. _encrypt_by_4_encrypt:
  1090. GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  1091. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
  1092. add $64, %r11
  1093. sub $64, %r13
  1094. jne _encrypt_by_4_encrypt
  1095. _four_cipher_left_encrypt:
  1096. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  1097. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  1098. _zero_cipher_left_encrypt:
  1099. mov %arg4, %r13
  1100. and $15, %r13 # %r13 = arg4 (mod 16)
  1101. je _multiple_of_16_bytes_encrypt
  1102. # Handle the last <16 Byte block separately
  1103. paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
  1104. pshufb SHUF_MASK(%rip), %xmm0
  1105. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
  1106. sub $16, %r11
  1107. add %r13, %r11
  1108. movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
  1109. lea SHIFT_MASK+16(%rip), %r12
  1110. sub %r13, %r12
  1111. # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
  1112. # (%r13 is the number of bytes in plaintext mod 16)
  1113. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
  1114. pshufb %xmm2, %xmm1 # shift right 16-r13 bytes
  1115. pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
  1116. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  1117. # get the appropriate mask to mask out top 16-r13 bytes of xmm0
  1118. pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
  1119. pshufb SHUF_MASK(%rip),%xmm0
  1120. pxor %xmm0, %xmm8
  1121. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1122. # GHASH computation for the last <16 byte block
  1123. sub %r13, %r11
  1124. add $16, %r11
  1125. pshufb SHUF_MASK(%rip), %xmm0
  1126. # shuffle xmm0 back to output as ciphertext
  1127. # Output %r13 bytes
  1128. movq %xmm0, %rax
  1129. cmp $8, %r13
  1130. jle _less_than_8_bytes_left_encrypt
  1131. mov %rax, (%arg2 , %r11, 1)
  1132. add $8, %r11
  1133. psrldq $8, %xmm0
  1134. movq %xmm0, %rax
  1135. sub $8, %r13
  1136. _less_than_8_bytes_left_encrypt:
  1137. mov %al, (%arg2, %r11, 1)
  1138. add $1, %r11
  1139. shr $8, %rax
  1140. sub $1, %r13
  1141. jne _less_than_8_bytes_left_encrypt
  1142. _multiple_of_16_bytes_encrypt:
  1143. mov arg8, %r12 # %r12 = aadLen (number of bytes)
  1144. shl $3, %r12
  1145. movd %r12d, %xmm15 # len(A) in %xmm15
  1146. shl $3, %arg4 # len(C) in bits (*8)
  1147. movq %arg4, %xmm1
  1148. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  1149. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  1150. pxor %xmm15, %xmm8
  1151. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1152. # final GHASH computation
  1153. pshufb SHUF_MASK(%rip), %xmm8 # perform a 16 byte swap
  1154. mov %arg5, %rax # %rax = *Y0
  1155. movdqu (%rax), %xmm0 # %xmm0 = Y0
  1156. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
  1157. pxor %xmm8, %xmm0
  1158. _return_T_encrypt:
  1159. mov arg9, %r10 # %r10 = authTag
  1160. mov arg10, %r11 # %r11 = auth_tag_len
  1161. cmp $16, %r11
  1162. je _T_16_encrypt
  1163. cmp $12, %r11
  1164. je _T_12_encrypt
  1165. _T_8_encrypt:
  1166. movq %xmm0, %rax
  1167. mov %rax, (%r10)
  1168. jmp _return_T_done_encrypt
  1169. _T_12_encrypt:
  1170. movq %xmm0, %rax
  1171. mov %rax, (%r10)
  1172. psrldq $8, %xmm0
  1173. movd %xmm0, %eax
  1174. mov %eax, 8(%r10)
  1175. jmp _return_T_done_encrypt
  1176. _T_16_encrypt:
  1177. movdqu %xmm0, (%r10)
  1178. _return_T_done_encrypt:
  1179. mov %r14, %rsp
  1180. pop %r14
  1181. pop %r13
  1182. pop %r12
  1183. ret
  1184. #endif
  1185. _key_expansion_128:
  1186. _key_expansion_256a:
  1187. pshufd $0b11111111, %xmm1, %xmm1
  1188. shufps $0b00010000, %xmm0, %xmm4
  1189. pxor %xmm4, %xmm0
  1190. shufps $0b10001100, %xmm0, %xmm4
  1191. pxor %xmm4, %xmm0
  1192. pxor %xmm1, %xmm0
  1193. movaps %xmm0, (TKEYP)
  1194. add $0x10, TKEYP
  1195. ret
  1196. .align 4
  1197. _key_expansion_192a:
  1198. pshufd $0b01010101, %xmm1, %xmm1
  1199. shufps $0b00010000, %xmm0, %xmm4
  1200. pxor %xmm4, %xmm0
  1201. shufps $0b10001100, %xmm0, %xmm4
  1202. pxor %xmm4, %xmm0
  1203. pxor %xmm1, %xmm0
  1204. movaps %xmm2, %xmm5
  1205. movaps %xmm2, %xmm6
  1206. pslldq $4, %xmm5
  1207. pshufd $0b11111111, %xmm0, %xmm3
  1208. pxor %xmm3, %xmm2
  1209. pxor %xmm5, %xmm2
  1210. movaps %xmm0, %xmm1
  1211. shufps $0b01000100, %xmm0, %xmm6
  1212. movaps %xmm6, (TKEYP)
  1213. shufps $0b01001110, %xmm2, %xmm1
  1214. movaps %xmm1, 0x10(TKEYP)
  1215. add $0x20, TKEYP
  1216. ret
  1217. .align 4
  1218. _key_expansion_192b:
  1219. pshufd $0b01010101, %xmm1, %xmm1
  1220. shufps $0b00010000, %xmm0, %xmm4
  1221. pxor %xmm4, %xmm0
  1222. shufps $0b10001100, %xmm0, %xmm4
  1223. pxor %xmm4, %xmm0
  1224. pxor %xmm1, %xmm0
  1225. movaps %xmm2, %xmm5
  1226. pslldq $4, %xmm5
  1227. pshufd $0b11111111, %xmm0, %xmm3
  1228. pxor %xmm3, %xmm2
  1229. pxor %xmm5, %xmm2
  1230. movaps %xmm0, (TKEYP)
  1231. add $0x10, TKEYP
  1232. ret
  1233. .align 4
  1234. _key_expansion_256b:
  1235. pshufd $0b10101010, %xmm1, %xmm1
  1236. shufps $0b00010000, %xmm2, %xmm4
  1237. pxor %xmm4, %xmm2
  1238. shufps $0b10001100, %xmm2, %xmm4
  1239. pxor %xmm4, %xmm2
  1240. pxor %xmm1, %xmm2
  1241. movaps %xmm2, (TKEYP)
  1242. add $0x10, TKEYP
  1243. ret
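/*
 * Illustrative sketch only: one AES-128 key-expansion step written with C
 * intrinsics (compile with -maes), producing the same next round key as
 * _key_expansion_128 above but using explicit byte shifts in place of the
 * shufps folding.  The assist value is the AESKEYGENASSIST result; names
 * are made up for this example.
 *
 *   #include <wmmintrin.h>        // _mm_aeskeygenassist_si128
 *
 *   static __m128i aes128_expand_step(__m128i prev, __m128i assist)
 *   {
 *       assist = _mm_shuffle_epi32(assist, 0xff);  // broadcast rot/sub word
 *       prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *       prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *       prev = _mm_xor_si128(prev, _mm_slli_si128(prev, 4));
 *       return _mm_xor_si128(prev, assist);        // next round key
 *   }
 *
 *   // usage: rk1 = aes128_expand_step(rk0, _mm_aeskeygenassist_si128(rk0, 0x1));
 */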
  1244. /*
  1245. * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
  1246. * unsigned int key_len)
  1247. */
ENTRY(aesni_set_key)
#ifndef __x86_64__
	pushl KEYP
	movl 8(%esp), KEYP # ctx
	movl 12(%esp), UKEYP # in_key
	movl 16(%esp), %edx # key_len
#endif
	movups (UKEYP), %xmm0 # user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP # key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(UKEYP), %xmm2 # other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2 # other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
	call _key_expansion_128
.Ldec_key:
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
	.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG
#ifndef __x86_64__
	popl KEYP
#endif
	ret
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl 12(%esp), KEYP
	movl 16(%esp), OUTP
	movl 20(%esp), INP
#endif
	movl 480(KEYP), KLEN # key length
	movups (INP), STATE # input
	call _aesni_enc1
	movups STATE, (OUTP) # output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	ret
/*
 * _aesni_enc1: internal ABI
 * input:
 *	KEYP: key struct pointer
 *	KLEN: key length
 *	STATE: initial state (input)
 * output:
 *	STATE: final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
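/*
 * Editorial sketch (not part of the original source): in rough C terms the
 * routine below does the following, where nrounds is 10, 12 or 14 for 16-,
 * 24- and 32-byte keys and rk[] are the 16-byte round keys stored
 * consecutively from KEYP:
 *
 *	state ^= rk[0];				// round 0 (whitening)
 *	for (i = 1; i < nrounds; i++)
 *		state = aesenc(state, rk[i]);	// AESENC
 *	state = aesenclast(state, rk[nrounds]);	// AESENCLAST
 *
 * The compare against 24 only decides how many of the leading AESENC rounds
 * to skip, so that the ten-round tail at .Lenc128 is shared by all key sizes.
 */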
	.align 4
_aesni_enc1:
	movaps (KEYP), KEY # key
	mov KEYP, TKEYP
	pxor KEY, STATE # round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
	.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
	.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE
	ret
/*
 * _aesni_enc4: internal ABI
 * input:
 *	KEYP: key struct pointer
 *	KLEN: key length
 *	STATE1: initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1: final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
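/*
 * Editorial note (not part of the original source): same round sequence as
 * _aesni_enc1, applied to four independent blocks, roughly:
 *
 *	for (i = 1; i < nrounds; i++) {
 *		key = rk[i];
 *		s1 = aesenc(s1, key);	// four independent AESENCs per round
 *		s2 = aesenc(s2, key);	// key; the instruction has multi-cycle
 *		s3 = aesenc(s3, key);	// latency but can be issued back to
 *		s4 = aesenc(s4, key);	// back, so interleaving hides most of it
 *	}
 *
 * This is why the ECB, CBC-decrypt and CTR loops further down process
 * 64 bytes per iteration whenever at least that much input remains.
 */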
	.align 4
_aesni_enc4:
	movaps (KEYP), KEY # key
	mov KEYP, TKEYP
	pxor KEY, STATE1 # round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1 # last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_dec)
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl 12(%esp), KEYP
	movl 16(%esp), OUTP
	movl 20(%esp), INP
#endif
	mov 480(KEYP), KLEN # key length
	add $240, KEYP
	movups (INP), STATE # input
	call _aesni_dec1
	movups STATE, (OUTP) # output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	ret
/*
 * _aesni_dec1: internal ABI
 * input:
 *	KEYP: key struct pointer
 *	KLEN: key length
 *	STATE: initial state (input)
 * output:
 *	STATE: final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
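/*
 * Editorial note (not part of the original source): mirror image of
 * _aesni_enc1, with AESDEC/AESDECLAST and the decryption round keys that
 * aesni_set_key laid out from byte 240 of the context (which is why the
 * callers do "add $240, KEYP" first).  Roughly:
 *
 *	state ^= inv_rk[0];
 *	for (i = 1; i < nrounds; i++)
 *		state = aesdec(state, inv_rk[i]);
 *	state = aesdeclast(state, inv_rk[nrounds]);
 */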
	.align 4
_aesni_dec1:
	movaps (KEYP), KEY # key
	mov KEYP, TKEYP
	pxor KEY, STATE # round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
	.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
	.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE
	ret
/*
 * _aesni_dec4: internal ABI
 * input:
 *	KEYP: key struct pointer
 *	KLEN: key length
 *	STATE1: initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1: final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
	.align 4
_aesni_dec4:
	movaps (KEYP), KEY # key
	mov KEYP, TKEYP
	pxor KEY, STATE1 # round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1 # last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			size_t len)
 */
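/*
 * Editorial sketch (not part of the original source): the loop structure
 * below, in rough C.  Whole 64-byte chunks go through _aesni_enc4, the
 * remaining whole blocks through _aesni_enc1, and any trailing partial
 * block (< 16 bytes) is simply not processed:
 *
 *	while (len >= 64) {
 *		encrypt4(dst, src);			// one _aesni_enc4 call
 *		src += 64; dst += 64; len -= 64;
 *	}
 *	while (len >= 16) {
 *		encrypt1(dst, src);			// one _aesni_enc1 call
 *		src += 16; dst += 16; len -= 16;
 *	}
 */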
ENTRY(aesni_ecb_enc)
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 16(%esp), KEYP
	movl 20(%esp), OUTP
	movl 24(%esp), INP
	movl 28(%esp), LEN
#endif
	test LEN, LEN # check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
	.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
	.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	ret
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			size_t len)
 */
ENTRY(aesni_ecb_dec)
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 16(%esp), KEYP
	movl 20(%esp), OUTP
	movl 24(%esp), INP
	movl 28(%esp), LEN
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
	.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
	.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	ret
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			size_t len, u8 *iv)
 */
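/*
 * Editorial sketch (not part of the original source): CBC encryption is
 * inherently serial (every block depends on the previous ciphertext block),
 * so only the one-block helper is used:
 *
 *	if (len < 16)
 *		return;
 *	state = load(iv);
 *	do {
 *		state = encrypt1(state ^ load(src));	// C[i] = E(P[i] ^ C[i-1])
 *		store(dst, state);
 *		src += 16; dst += 16; len -= 16;
 *	} while (len >= 16);
 *	store(ivp, state);	// chaining value handed back to the caller
 */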
ENTRY(aesni_cbc_enc)
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP
	movl 24(%esp), OUTP
	movl 28(%esp), INP
	movl 32(%esp), LEN
	movl 36(%esp), IVP
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE # load iv as initial state
	.align 4
.Lcbc_enc_loop:
	movups (INP), IN # load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP) # store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			size_t len, u8 *iv)
 */
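/*
 * Editorial sketch (not part of the original source): unlike encryption,
 * CBC decryption parallelises, since P[i] = D(C[i]) ^ C[i-1] needs only
 * ciphertext as input.  One iteration of the four-block loop is roughly:
 *
 *	c0 = load(src);      c1 = load(src + 16);
 *	c2 = load(src + 32); c3 = load(src + 48);
 *	decrypt4(&d0, &d1, &d2, &d3, c0, c1, c2, c3);	// one _aesni_dec4 call
 *	store(dst,      d0 ^ iv);
 *	store(dst + 16, d1 ^ c0);
 *	store(dst + 32, d2 ^ c1);
 *	store(dst + 48, d3 ^ c2);
 *	iv = c3;
 *
 * On 32-bit there are not enough XMM registers to keep all four saved
 * ciphertext blocks live, so two of them are re-read from (INP) after the
 * call instead.
 */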
ENTRY(aesni_cbc_dec)
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP
	movl 24(%esp), OUTP
	movl 28(%esp), INP
	movl 32(%esp), LEN
	movl 36(%esp), IVP
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
	.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor (INP), STATE2
	pxor 0x10(INP), STATE3
	pxor IN1, STATE4
	movaps IN2, IV
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
	.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret
#ifdef __x86_64__
	.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/*
 * _aesni_inc_init: internal ABI
 * setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR: == IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC: == 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
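/*
 * Editorial note (not part of the original source): the counter is kept
 * byte-swapped in CTR so that paddq can do the arithmetic; it is only
 * converted back to big endian when handed to the cipher.  Roughly:
 *
 *	ctr      = bswap128(iv);	// PSHUFB with .Lbswap_mask
 *	inc      = 1;			// lives in the low qword of INC
 *	tctr_low = low64(ctr);		// low qword mirrored in a GPR
 */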
	.align 4
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC
	MOVQ_R64_XMM CTR TCTR_LOW
	ret
/*
 * _aesni_inc: internal ABI
 * Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR: == IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC: == 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV: increased by 1
 * changed:
 *	CTR: == output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
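/*
 * Editorial sketch (not part of the original source): a 128-bit big-endian
 * increment.  paddq cannot report a carry, so the low qword is mirrored in
 * the integer register TCTR_LOW purely to get the carry flag:
 *
 *	ctr_low += 1;			// paddq INC, CTR
 *	if (++tctr_low == 0)		// carry out of the low qword?
 *		ctr_high += 1;		// INC shifted up, added, shifted back
 *	iv = bswap128(ctr);
 */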
	.align 4
_aesni_inc:
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV
	ret
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			size_t len, u8 *iv)
 */
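/*
 * Editorial sketch (not part of the original source): CTR mode uses the
 * block cipher as a keystream generator, so both paths below encrypt
 * counter blocks and XOR the result onto the input (four counters at a
 * time via _aesni_enc4 while at least 64 bytes remain):
 *
 *	if (len < 16)
 *		return;
 *	do {
 *		keystream = encrypt1(iv);	// current counter block
 *		iv = iv + 1;			// big-endian increment (_aesni_inc)
 *		store(dst, load(src) ^ keystream);
 *		src += 16; dst += 16; len -= 16;
 *	} while (len >= 16);
 *	store(ivp, iv);		// updated counter handed back to the caller
 */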
ENTRY(aesni_ctr_enc)
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
	.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
	.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	ret
#endif