  1. /*
  2. * Implement AES algorithm in Intel AES-NI instructions.
  3. *
  4. * The white paper of AES-NI instructions can be downloaded from:
  5. * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
  6. *
  7. * Copyright (C) 2008, Intel Corp.
  8. * Author: Huang Ying <ying.huang@intel.com>
  9. * Vinodh Gopal <vinodh.gopal@intel.com>
  10. * Kahraman Akdemir
  11. *
  12. * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  13. * interface for 64-bit kernels.
  14. * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  15. * Aidan O'Mahony (aidan.o.mahony@intel.com)
  16. * Adrian Hoban <adrian.hoban@intel.com>
  17. * James Guilford (james.guilford@intel.com)
  18. * Gabriele Paoloni <gabriele.paoloni@intel.com>
  19. * Tadeusz Struk (tadeusz.struk@intel.com)
  20. * Wajdi Feghali (wajdi.k.feghali@intel.com)
  21. * Copyright (c) 2010, Intel Corporation.
  22. *
  23. * Ported x86_64 version to x86:
  24. * Author: Mathias Krause <minipli@googlemail.com>
  25. *
  26. * This program is free software; you can redistribute it and/or modify
  27. * it under the terms of the GNU General Public License as published by
  28. * the Free Software Foundation; either version 2 of the License, or
  29. * (at your option) any later version.
  30. */
  31. #include <linux/linkage.h>
  32. #include <asm/inst.h>
  33. .data
  34. POLY: .octa 0xC2000000000000000000000000000001
  35. TWOONE: .octa 0x00000001000000000000000000000001
  36. # order of these constants should not change.
  37. # more specifically, ALL_F should follow SHIFT_MASK,
  38. # and ZERO should follow ALL_F
  39. SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
  40. MASK1: .octa 0x0000000000000000ffffffffffffffff
  41. MASK2: .octa 0xffffffffffffffff0000000000000000
  42. SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  43. ALL_F: .octa 0xffffffffffffffffffffffffffffffff
  44. ZERO: .octa 0x00000000000000000000000000000000
  45. ONE: .octa 0x00000000000000000000000000000001
  46. F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  47. dec: .octa 0x1
  48. enc: .octa 0x2
  49. .text
  50. #define STACK_OFFSET 8*3
  51. #define HashKey 16*0 // store HashKey <<1 mod poly here
  52. #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
  53. #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
  54. #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
  55. #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
  56. // bits of HashKey <<1 mod poly here
  57. //(for Karatsuba purposes)
  58. #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
  59. // bits of HashKey^2 <<1 mod poly here
  60. // (for Karatsuba purposes)
  61. #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
  62. // bits of HashKey^3 <<1 mod poly here
  63. // (for Karatsuba purposes)
  64. #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
  65. // bits of HashKey^4 <<1 mod poly here
  66. // (for Karatsuba purposes)
  67. #define VARIABLE_OFFSET 16*8
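/*
 * Note: HashKey..HashKey_4_k above are %rsp-relative slots inside the
 * VARIABLE_OFFSET (16*8 = 128 byte) scratch area that aesni_gcm_enc and
 * aesni_gcm_dec reserve on the stack; they cache the powers of the hash
 * subkey and the pre-folded Karatsuba halves so the 4-block GHASH loop does
 * not have to recompute them.
 */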
  68. #define arg1 rdi
  69. #define arg2 rsi
  70. #define arg3 rdx
  71. #define arg4 rcx
  72. #define arg5 r8
  73. #define arg6 r9
  74. #define arg7 STACK_OFFSET+8(%r14)
  75. #define arg8 STACK_OFFSET+16(%r14)
  76. #define arg9 STACK_OFFSET+24(%r14)
  77. #define arg10 STACK_OFFSET+32(%r14)
  78. #define STATE1 %xmm0
  79. #define STATE2 %xmm4
  80. #define STATE3 %xmm5
  81. #define STATE4 %xmm6
  82. #define STATE STATE1
  83. #define IN1 %xmm1
  84. #define IN2 %xmm7
  85. #define IN3 %xmm8
  86. #define IN4 %xmm9
  87. #define IN IN1
  88. #define KEY %xmm2
  89. #define IV %xmm3
  90. #define BSWAP_MASK %xmm10
  91. #define CTR %xmm11
  92. #define INC %xmm12
  93. #ifdef __x86_64__
  94. #define AREG %rax
  95. #define KEYP %rdi
  96. #define OUTP %rsi
  97. #define UKEYP OUTP
  98. #define INP %rdx
  99. #define LEN %rcx
  100. #define IVP %r8
  101. #define KLEN %r9d
  102. #define T1 %r10
  103. #define TKEYP T1
  104. #define T2 %r11
  105. #define TCTR_LOW T2
  106. #else
  107. #define AREG %eax
  108. #define KEYP %edi
  109. #define OUTP AREG
  110. #define UKEYP OUTP
  111. #define INP %edx
  112. #define LEN %esi
  113. #define IVP %ebp
  114. #define KLEN %ebx
  115. #define T1 %ecx
  116. #define TKEYP T1
  117. #endif
  118. /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  119. *
  120. *
  121. * Input: A and B (128-bits each, bit-reflected)
  122. * Output: C = A*B*x mod poly, (i.e. >>1 )
  123. * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  124. * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  125. *
  126. */
  127. .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
  128. movdqa \GH, \TMP1
  129. pshufd $78, \GH, \TMP2
  130. pshufd $78, \HK, \TMP3
  131. pxor \GH, \TMP2 # TMP2 = a1+a0
  132. pxor \HK, \TMP3 # TMP3 = b1+b0
  133. PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
  134. PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
  135. PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
  136. pxor \GH, \TMP2
137. pxor \TMP1, \TMP2 # TMP2 = a1*b0 + a0*b1 (Karatsuba middle term)
  138. movdqa \TMP2, \TMP3
  139. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  140. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  141. pxor \TMP3, \GH
142. pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
  143. # first phase of the reduction
  144. movdqa \GH, \TMP2
  145. movdqa \GH, \TMP3
  146. movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
147. # in order to perform
  148. # independent shifts
149. pslld $31, \TMP2 # packed left shift <<31
150. pslld $30, \TMP3 # packed left shift <<30
151. pslld $25, \TMP4 # packed left shift <<25
  152. pxor \TMP3, \TMP2 # xor the shifted versions
  153. pxor \TMP4, \TMP2
  154. movdqa \TMP2, \TMP5
  155. psrldq $4, \TMP5 # right shift TMP5 1 DW
  156. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  157. pxor \TMP2, \GH
  158. # second phase of the reduction
  159. movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
160. # in order to perform
  161. # independent shifts
  162. movdqa \GH,\TMP3
  163. movdqa \GH,\TMP4
164. psrld $1,\TMP2 # packed right shift >>1
165. psrld $2,\TMP3 # packed right shift >>2
166. psrld $7,\TMP4 # packed right shift >>7
  167. pxor \TMP3,\TMP2 # xor the shifted versions
  168. pxor \TMP4,\TMP2
  169. pxor \TMP5, \TMP2
  170. pxor \TMP2, \GH
171. pxor \TMP1, \GH # result is in GH
  172. .endm
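/*
 * Algebraically, with GH = a1:a0 and HK = b1:b0 (64-bit halves), GHASH_MUL
 * forms the 256-bit carry-less product
 *     a1*b1 * x^128  +  (a1*b0 + a0*b1) * x^64  +  a0*b0
 * with three PCLMULQDQs (Karatsuba: the middle term is computed as
 * (a1+a0)*(b1+b0) + a1*b1 + a0*b0), and then folds the high 128 bits back
 * modulo g(x) = x^128 + x^127 + x^126 + x^121 + 1.  Because the operands are
 * bit-reflected, the reduction is carried out as the two shift-and-xor
 * phases above (left shifts by 31, 30, 25, then right shifts by 1, 2, 7)
 * rather than by dividing by the polynomial directly.
 */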
  173. /*
  174. * if a = number of total plaintext bytes
  175. * b = floor(a/16)
  176. * num_initial_blocks = b mod 4
  177. * encrypt the initial num_initial_blocks blocks and apply ghash on
  178. * the ciphertext
  179. * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  180. * are clobbered
181. * %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
  182. */
  183. .macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
  184. XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
  185. mov arg7, %r10 # %r10 = AAD
  186. mov arg8, %r12 # %r12 = aadLen
  187. mov %r12, %r11
  188. pxor %xmm\i, %xmm\i
  189. _get_AAD_loop\num_initial_blocks\operation:
  190. movd (%r10), \TMP1
  191. pslldq $12, \TMP1
  192. psrldq $4, %xmm\i
  193. pxor \TMP1, %xmm\i
  194. add $4, %r10
  195. sub $4, %r12
  196. jne _get_AAD_loop\num_initial_blocks\operation
  197. cmp $16, %r11
  198. je _get_AAD_loop2_done\num_initial_blocks\operation
  199. mov $16, %r12
  200. _get_AAD_loop2\num_initial_blocks\operation:
  201. psrldq $4, %xmm\i
  202. sub $4, %r12
  203. cmp %r11, %r12
  204. jne _get_AAD_loop2\num_initial_blocks\operation
  205. _get_AAD_loop2_done\num_initial_blocks\operation:
  206. pshufb SHUF_MASK(%rip), %xmm\i # byte-reflect the AAD data
  207. xor %r11, %r11 # initialise the data pointer offset as zero
  208. # start AES for num_initial_blocks blocks
  209. mov %arg5, %rax # %rax = *Y0
  210. movdqu (%rax), \XMM0 # XMM0 = Y0
  211. pshufb SHUF_MASK(%rip), \XMM0
  212. .if \i_seq != 0
  213. .irpc index, \i_seq
  214. paddd ONE(%rip), \XMM0 # INCR Y0
  215. movdqa \XMM0, %xmm\index
  216. pshufb SHUF_MASK(%rip), %xmm\index # perform a 16 byte swap
  217. .endr
  218. .irpc index, \i_seq
  219. pxor 16*0(%arg1), %xmm\index
  220. .endr
  221. .irpc index, \i_seq
  222. movaps 0x10(%rdi), \TMP1
  223. AESENC \TMP1, %xmm\index # Round 1
  224. .endr
  225. .irpc index, \i_seq
  226. movaps 0x20(%arg1), \TMP1
  227. AESENC \TMP1, %xmm\index # Round 2
  228. .endr
  229. .irpc index, \i_seq
  230. movaps 0x30(%arg1), \TMP1
231. AESENC \TMP1, %xmm\index # Round 3
  232. .endr
  233. .irpc index, \i_seq
  234. movaps 0x40(%arg1), \TMP1
235. AESENC \TMP1, %xmm\index # Round 4
  236. .endr
  237. .irpc index, \i_seq
  238. movaps 0x50(%arg1), \TMP1
239. AESENC \TMP1, %xmm\index # Round 5
  240. .endr
  241. .irpc index, \i_seq
  242. movaps 0x60(%arg1), \TMP1
243. AESENC \TMP1, %xmm\index # Round 6
  244. .endr
  245. .irpc index, \i_seq
  246. movaps 0x70(%arg1), \TMP1
247. AESENC \TMP1, %xmm\index # Round 7
  248. .endr
  249. .irpc index, \i_seq
  250. movaps 0x80(%arg1), \TMP1
251. AESENC \TMP1, %xmm\index # Round 8
  252. .endr
  253. .irpc index, \i_seq
  254. movaps 0x90(%arg1), \TMP1
255. AESENC \TMP1, %xmm\index # Round 9
  256. .endr
  257. .irpc index, \i_seq
  258. movaps 0xa0(%arg1), \TMP1
  259. AESENCLAST \TMP1, %xmm\index # Round 10
  260. .endr
  261. .irpc index, \i_seq
  262. movdqu (%arg3 , %r11, 1), \TMP1
  263. pxor \TMP1, %xmm\index
  264. movdqu %xmm\index, (%arg2 , %r11, 1)
  265. # write back plaintext/ciphertext for num_initial_blocks
  266. add $16, %r11
  267. .if \operation == dec
  268. movdqa \TMP1, %xmm\index
  269. .endif
  270. pshufb SHUF_MASK(%rip), %xmm\index
  271. # prepare plaintext/ciphertext for GHASH computation
  272. .endr
  273. .endif
  274. GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  275. # apply GHASH on num_initial_blocks blocks
  276. .if \i == 5
  277. pxor %xmm5, %xmm6
  278. GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  279. pxor %xmm6, %xmm7
  280. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  281. pxor %xmm7, %xmm8
  282. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  283. .elseif \i == 6
  284. pxor %xmm6, %xmm7
  285. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  286. pxor %xmm7, %xmm8
  287. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  288. .elseif \i == 7
  289. pxor %xmm7, %xmm8
  290. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  291. .endif
  292. cmp $64, %r13
  293. jl _initial_blocks_done\num_initial_blocks\operation
  294. # no need for precomputed values
  295. /*
  296. *
  297. * Precomputations for HashKey parallel with encryption of first 4 blocks.
298. * HashKey_i_k holds XORed values of the low and high parts of HashKey_i
  299. */
  300. paddd ONE(%rip), \XMM0 # INCR Y0
  301. movdqa \XMM0, \XMM1
  302. pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
  303. paddd ONE(%rip), \XMM0 # INCR Y0
  304. movdqa \XMM0, \XMM2
  305. pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
  306. paddd ONE(%rip), \XMM0 # INCR Y0
  307. movdqa \XMM0, \XMM3
  308. pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
  309. paddd ONE(%rip), \XMM0 # INCR Y0
  310. movdqa \XMM0, \XMM4
  311. pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
  312. pxor 16*0(%arg1), \XMM1
  313. pxor 16*0(%arg1), \XMM2
  314. pxor 16*0(%arg1), \XMM3
  315. pxor 16*0(%arg1), \XMM4
  316. movdqa \TMP3, \TMP5
  317. pshufd $78, \TMP3, \TMP1
  318. pxor \TMP3, \TMP1
  319. movdqa \TMP1, HashKey_k(%rsp)
  320. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  321. # TMP5 = HashKey^2<<1 (mod poly)
  322. movdqa \TMP5, HashKey_2(%rsp)
  323. # HashKey_2 = HashKey^2<<1 (mod poly)
  324. pshufd $78, \TMP5, \TMP1
  325. pxor \TMP5, \TMP1
  326. movdqa \TMP1, HashKey_2_k(%rsp)
  327. .irpc index, 1234 # do 4 rounds
  328. movaps 0x10*\index(%arg1), \TMP1
  329. AESENC \TMP1, \XMM1
  330. AESENC \TMP1, \XMM2
  331. AESENC \TMP1, \XMM3
  332. AESENC \TMP1, \XMM4
  333. .endr
  334. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  335. # TMP5 = HashKey^3<<1 (mod poly)
  336. movdqa \TMP5, HashKey_3(%rsp)
  337. pshufd $78, \TMP5, \TMP1
  338. pxor \TMP5, \TMP1
  339. movdqa \TMP1, HashKey_3_k(%rsp)
  340. .irpc index, 56789 # do next 5 rounds
  341. movaps 0x10*\index(%arg1), \TMP1
  342. AESENC \TMP1, \XMM1
  343. AESENC \TMP1, \XMM2
  344. AESENC \TMP1, \XMM3
  345. AESENC \TMP1, \XMM4
  346. .endr
  347. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
348. # TMP5 = HashKey^4<<1 (mod poly)
  349. movdqa \TMP5, HashKey_4(%rsp)
  350. pshufd $78, \TMP5, \TMP1
  351. pxor \TMP5, \TMP1
  352. movdqa \TMP1, HashKey_4_k(%rsp)
  353. movaps 0xa0(%arg1), \TMP2
  354. AESENCLAST \TMP2, \XMM1
  355. AESENCLAST \TMP2, \XMM2
  356. AESENCLAST \TMP2, \XMM3
  357. AESENCLAST \TMP2, \XMM4
  358. movdqu 16*0(%arg3 , %r11 , 1), \TMP1
  359. pxor \TMP1, \XMM1
  360. .if \operation == dec
  361. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  362. movdqa \TMP1, \XMM1
  363. .endif
  364. movdqu 16*1(%arg3 , %r11 , 1), \TMP1
  365. pxor \TMP1, \XMM2
  366. .if \operation == dec
  367. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  368. movdqa \TMP1, \XMM2
  369. .endif
  370. movdqu 16*2(%arg3 , %r11 , 1), \TMP1
  371. pxor \TMP1, \XMM3
  372. .if \operation == dec
  373. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  374. movdqa \TMP1, \XMM3
  375. .endif
  376. movdqu 16*3(%arg3 , %r11 , 1), \TMP1
  377. pxor \TMP1, \XMM4
  378. .if \operation == dec
  379. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  380. movdqa \TMP1, \XMM4
  381. .else
  382. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  383. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  384. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  385. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  386. .endif
  387. add $64, %r11
  388. pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
  389. pxor \XMMDst, \XMM1
  390. # combine GHASHed value with the corresponding ciphertext
  391. pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
  392. pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
  393. pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
  394. _initial_blocks_done\num_initial_blocks\operation:
  395. .endm
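/*
 * The \i / \i_seq arguments encode which scratch %xmm registers hold the
 * initial blocks; the callers below pass:
 *   num_initial_blocks = 0 -> i = 8, i_seq = 0   (no blocks, AAD in %xmm8)
 *   num_initial_blocks = 1 -> i = 7, i_seq = 8
 *   num_initial_blocks = 2 -> i = 6, i_seq = 78
 *   num_initial_blocks = 3 -> i = 5, i_seq = 678
 * so the partially accumulated GHASH value always ends up in %xmm8.
 */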
  396. /*
  397. * encrypt 4 blocks at a time
  398. * ghash the 4 previously encrypted ciphertext blocks
  399. * arg1, %arg2, %arg3 are used as pointers only, not modified
  400. * %r11 is the data offset value
  401. */
  402. .macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \
  403. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  404. movdqa \XMM1, \XMM5
  405. movdqa \XMM2, \XMM6
  406. movdqa \XMM3, \XMM7
  407. movdqa \XMM4, \XMM8
  408. # multiply TMP5 * HashKey using karatsuba
  409. movdqa \XMM5, \TMP4
  410. pshufd $78, \XMM5, \TMP6
  411. pxor \XMM5, \TMP6
  412. paddd ONE(%rip), \XMM0 # INCR CNT
  413. movdqa HashKey_4(%rsp), \TMP5
  414. PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  415. movdqa \XMM0, \XMM1
  416. paddd ONE(%rip), \XMM0 # INCR CNT
  417. movdqa \XMM0, \XMM2
  418. paddd ONE(%rip), \XMM0 # INCR CNT
  419. movdqa \XMM0, \XMM3
  420. paddd ONE(%rip), \XMM0 # INCR CNT
  421. movdqa \XMM0, \XMM4
  422. pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
  423. PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  424. pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
  425. pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
  426. pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
  427. pxor (%arg1), \XMM1
  428. pxor (%arg1), \XMM2
  429. pxor (%arg1), \XMM3
  430. pxor (%arg1), \XMM4
  431. movdqa HashKey_4_k(%rsp), \TMP5
  432. PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  433. movaps 0x10(%arg1), \TMP1
  434. AESENC \TMP1, \XMM1 # Round 1
  435. AESENC \TMP1, \XMM2
  436. AESENC \TMP1, \XMM3
  437. AESENC \TMP1, \XMM4
  438. movaps 0x20(%arg1), \TMP1
  439. AESENC \TMP1, \XMM1 # Round 2
  440. AESENC \TMP1, \XMM2
  441. AESENC \TMP1, \XMM3
  442. AESENC \TMP1, \XMM4
  443. movdqa \XMM6, \TMP1
  444. pshufd $78, \XMM6, \TMP2
  445. pxor \XMM6, \TMP2
  446. movdqa HashKey_3(%rsp), \TMP5
  447. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  448. movaps 0x30(%arg1), \TMP3
  449. AESENC \TMP3, \XMM1 # Round 3
  450. AESENC \TMP3, \XMM2
  451. AESENC \TMP3, \XMM3
  452. AESENC \TMP3, \XMM4
  453. PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  454. movaps 0x40(%arg1), \TMP3
  455. AESENC \TMP3, \XMM1 # Round 4
  456. AESENC \TMP3, \XMM2
  457. AESENC \TMP3, \XMM3
  458. AESENC \TMP3, \XMM4
  459. movdqa HashKey_3_k(%rsp), \TMP5
  460. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  461. movaps 0x50(%arg1), \TMP3
  462. AESENC \TMP3, \XMM1 # Round 5
  463. AESENC \TMP3, \XMM2
  464. AESENC \TMP3, \XMM3
  465. AESENC \TMP3, \XMM4
  466. pxor \TMP1, \TMP4
  467. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  468. pxor \XMM6, \XMM5
  469. pxor \TMP2, \TMP6
  470. movdqa \XMM7, \TMP1
  471. pshufd $78, \XMM7, \TMP2
  472. pxor \XMM7, \TMP2
  473. movdqa HashKey_2(%rsp ), \TMP5
  474. # Multiply TMP5 * HashKey using karatsuba
  475. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  476. movaps 0x60(%arg1), \TMP3
  477. AESENC \TMP3, \XMM1 # Round 6
  478. AESENC \TMP3, \XMM2
  479. AESENC \TMP3, \XMM3
  480. AESENC \TMP3, \XMM4
  481. PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  482. movaps 0x70(%arg1), \TMP3
  483. AESENC \TMP3, \XMM1 # Round 7
  484. AESENC \TMP3, \XMM2
  485. AESENC \TMP3, \XMM3
  486. AESENC \TMP3, \XMM4
  487. movdqa HashKey_2_k(%rsp), \TMP5
  488. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  489. movaps 0x80(%arg1), \TMP3
  490. AESENC \TMP3, \XMM1 # Round 8
  491. AESENC \TMP3, \XMM2
  492. AESENC \TMP3, \XMM3
  493. AESENC \TMP3, \XMM4
  494. pxor \TMP1, \TMP4
  495. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  496. pxor \XMM7, \XMM5
  497. pxor \TMP2, \TMP6
  498. # Multiply XMM8 * HashKey
  499. # XMM8 and TMP5 hold the values for the two operands
  500. movdqa \XMM8, \TMP1
  501. pshufd $78, \XMM8, \TMP2
  502. pxor \XMM8, \TMP2
  503. movdqa HashKey(%rsp), \TMP5
  504. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  505. movaps 0x90(%arg1), \TMP3
  506. AESENC \TMP3, \XMM1 # Round 9
  507. AESENC \TMP3, \XMM2
  508. AESENC \TMP3, \XMM3
  509. AESENC \TMP3, \XMM4
  510. PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  511. movaps 0xa0(%arg1), \TMP3
  512. AESENCLAST \TMP3, \XMM1 # Round 10
  513. AESENCLAST \TMP3, \XMM2
  514. AESENCLAST \TMP3, \XMM3
  515. AESENCLAST \TMP3, \XMM4
  516. movdqa HashKey_k(%rsp), \TMP5
  517. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  518. movdqu (%arg3,%r11,1), \TMP3
  519. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  520. .if \operation == dec
  521. movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
  522. movdqa \TMP3, \XMM1
  523. .endif
  524. movdqu 16(%arg3,%r11,1), \TMP3
  525. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  526. .if \operation == dec
  527. movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
  528. movdqa \TMP3, \XMM2
  529. .endif
  530. movdqu 32(%arg3,%r11,1), \TMP3
  531. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  532. .if \operation == dec
  533. movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
  534. movdqa \TMP3, \XMM3
  535. .endif
  536. movdqu 48(%arg3,%r11,1), \TMP3
  537. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  538. .if \operation == dec
  539. movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
  540. movdqa \TMP3, \XMM4
  541. .else
  542. movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
  543. movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
  544. movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
  545. movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
  546. .endif
  547. pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
  548. pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
  549. pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
550. pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
  551. pxor \TMP4, \TMP1
  552. pxor \XMM8, \XMM5
  553. pxor \TMP6, \TMP2
  554. pxor \TMP1, \TMP2
  555. pxor \XMM5, \TMP2
  556. movdqa \TMP2, \TMP3
  557. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  558. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  559. pxor \TMP3, \XMM5
  560. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  561. # first phase of reduction
  562. movdqa \XMM5, \TMP2
  563. movdqa \XMM5, \TMP3
  564. movdqa \XMM5, \TMP4
  565. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
566. pslld $31, \TMP2 # packed left shift << 31
567. pslld $30, \TMP3 # packed left shift << 30
568. pslld $25, \TMP4 # packed left shift << 25
  569. pxor \TMP3, \TMP2 # xor the shifted versions
  570. pxor \TMP4, \TMP2
  571. movdqa \TMP2, \TMP5
  572. psrldq $4, \TMP5 # right shift T5 1 DW
  573. pslldq $12, \TMP2 # left shift T2 3 DWs
  574. pxor \TMP2, \XMM5
  575. # second phase of reduction
  576. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  577. movdqa \XMM5,\TMP3
  578. movdqa \XMM5,\TMP4
579. psrld $1, \TMP2 # packed right shift >>1
580. psrld $2, \TMP3 # packed right shift >>2
581. psrld $7, \TMP4 # packed right shift >>7
  582. pxor \TMP3,\TMP2 # xor the shifted versions
  583. pxor \TMP4,\TMP2
  584. pxor \TMP5, \TMP2
  585. pxor \TMP2, \XMM5
586. pxor \TMP1, \XMM5 # result is in XMM5
  587. pxor \XMM5, \XMM1
  588. .endm
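/*
 * Note: each pass through GHASH_4_ENCRYPT_4_PARALLEL CTR-encrypts four new
 * counter blocks while GHASHing the four ciphertext blocks produced by the
 * previous pass (XMM1-XMM4 on entry), multiplying them by HashKey^4..HashKey
 * respectively so a single reduction at the end folds all four products.
 * Interleaving the AESENC rounds with the PCLMULQDQs helps hide the latency
 * of both instruction types.
 */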
  589. /* GHASH the last 4 ciphertext blocks. */
  590. .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
  591. TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
592. # Multiply XMM1 * HashKey^4 (using Karatsuba)
  593. movdqa \XMM1, \TMP6
  594. pshufd $78, \XMM1, \TMP2
  595. pxor \XMM1, \TMP2
  596. movdqa HashKey_4(%rsp), \TMP5
  597. PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
  598. PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
  599. movdqa HashKey_4_k(%rsp), \TMP4
  600. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  601. movdqa \XMM1, \XMMDst
  602. movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
603. # Multiply XMM2 * HashKey^3 (using Karatsuba)
  604. movdqa \XMM2, \TMP1
  605. pshufd $78, \XMM2, \TMP2
  606. pxor \XMM2, \TMP2
  607. movdqa HashKey_3(%rsp), \TMP5
  608. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  609. PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
  610. movdqa HashKey_3_k(%rsp), \TMP4
  611. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  612. pxor \TMP1, \TMP6
  613. pxor \XMM2, \XMMDst
  614. pxor \TMP2, \XMM1
  615. # results accumulated in TMP6, XMMDst, XMM1
616. # Multiply XMM3 * HashKey^2 (using Karatsuba)
  617. movdqa \XMM3, \TMP1
  618. pshufd $78, \XMM3, \TMP2
  619. pxor \XMM3, \TMP2
  620. movdqa HashKey_2(%rsp), \TMP5
  621. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  622. PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
  623. movdqa HashKey_2_k(%rsp), \TMP4
  624. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  625. pxor \TMP1, \TMP6
  626. pxor \XMM3, \XMMDst
  627. pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
628. # Multiply XMM4 * HashKey (using Karatsuba)
  629. movdqa \XMM4, \TMP1
  630. pshufd $78, \XMM4, \TMP2
  631. pxor \XMM4, \TMP2
  632. movdqa HashKey(%rsp), \TMP5
  633. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  634. PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
  635. movdqa HashKey_k(%rsp), \TMP4
  636. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  637. pxor \TMP1, \TMP6
  638. pxor \XMM4, \XMMDst
  639. pxor \XMM1, \TMP2
  640. pxor \TMP6, \TMP2
  641. pxor \XMMDst, \TMP2
  642. # middle section of the temp results combined as in karatsuba algorithm
  643. movdqa \TMP2, \TMP4
  644. pslldq $8, \TMP4 # left shift TMP4 2 DWs
  645. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  646. pxor \TMP4, \XMMDst
  647. pxor \TMP2, \TMP6
  648. # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
  649. # first phase of the reduction
  650. movdqa \XMMDst, \TMP2
  651. movdqa \XMMDst, \TMP3
  652. movdqa \XMMDst, \TMP4
  653. # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
654. pslld $31, \TMP2 # packed left shifting << 31
655. pslld $30, \TMP3 # packed left shifting << 30
656. pslld $25, \TMP4 # packed left shifting << 25
  657. pxor \TMP3, \TMP2 # xor the shifted versions
  658. pxor \TMP4, \TMP2
  659. movdqa \TMP2, \TMP7
  660. psrldq $4, \TMP7 # right shift TMP7 1 DW
  661. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  662. pxor \TMP2, \XMMDst
  663. # second phase of the reduction
  664. movdqa \XMMDst, \TMP2
  665. # make 3 copies of XMMDst for doing 3 shift operations
  666. movdqa \XMMDst, \TMP3
  667. movdqa \XMMDst, \TMP4
668. psrld $1, \TMP2 # packed right shift >> 1
669. psrld $2, \TMP3 # packed right shift >> 2
670. psrld $7, \TMP4 # packed right shift >> 7
  671. pxor \TMP3, \TMP2 # xor the shifted versions
  672. pxor \TMP4, \TMP2
  673. pxor \TMP7, \TMP2
  674. pxor \TMP2, \XMMDst
  675. pxor \TMP6, \XMMDst # reduced result is in XMMDst
  676. .endm
677. /* Encryption of a single block done */
  678. .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
  679. pxor (%arg1), \XMM0
  680. movaps 16(%arg1), \TMP1
  681. AESENC \TMP1, \XMM0
  682. movaps 32(%arg1), \TMP1
  683. AESENC \TMP1, \XMM0
  684. movaps 48(%arg1), \TMP1
  685. AESENC \TMP1, \XMM0
  686. movaps 64(%arg1), \TMP1
  687. AESENC \TMP1, \XMM0
  688. movaps 80(%arg1), \TMP1
  689. AESENC \TMP1, \XMM0
  690. movaps 96(%arg1), \TMP1
  691. AESENC \TMP1, \XMM0
  692. movaps 112(%arg1), \TMP1
  693. AESENC \TMP1, \XMM0
  694. movaps 128(%arg1), \TMP1
  695. AESENC \TMP1, \XMM0
  696. movaps 144(%arg1), \TMP1
  697. AESENC \TMP1, \XMM0
  698. movaps 160(%arg1), \TMP1
  699. AESENCLAST \TMP1, \XMM0
  700. .endm
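/*
 * ENCRYPT_SINGLE_BLOCK hard-codes the ten rounds of AES-128 (round keys at
 * byte offsets 0x10..0xa0 of the key schedule), matching the "first set of
 * 11 keys" assumption documented for aesni_gcm_enc/dec below.
 */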
  701. /*****************************************************************************
  702. * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  703. * u8 *out, // Plaintext output. Encrypt in-place is allowed.
  704. * const u8 *in, // Ciphertext input
  705. * u64 plaintext_len, // Length of data in bytes for decryption.
  706. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  707. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  708. * // concatenated with 0x00000001. 16-byte aligned pointer.
  709. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  710. * const u8 *aad, // Additional Authentication Data (AAD)
  711. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  712. * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
  713. * // given authentication tag and only return the plaintext if they match.
  714. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
  715. * // (most likely), 12 or 8.
  716. *
  717. * Assumptions:
  718. *
  719. * keys:
  720. * keys are pre-expanded and aligned to 16 bytes. we are using the first
  721. * set of 11 keys in the data structure void *aes_ctx
  722. *
  723. * iv:
  724. * 0 1 2 3
  725. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  726. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  727. * | Salt (From the SA) |
  728. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  729. * | Initialization Vector |
  730. * | (This is the sequence number from IPSec header) |
  731. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  732. * | 0x1 |
  733. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  734. *
  735. *
  736. *
  737. * AAD:
  738. * AAD padded to 128 bits with 0
  739. * for example, assume AAD is a u32 vector
  740. *
  741. * if AAD is 8 bytes:
  742. * AAD[3] = {A0, A1};
  743. * padded AAD in xmm register = {A1 A0 0 0}
  744. *
  745. * 0 1 2 3
  746. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  747. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  748. * | SPI (A1) |
  749. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  750. * | 32-bit Sequence Number (A0) |
  751. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  752. * | 0x0 |
  753. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  754. *
  755. * AAD Format with 32-bit Sequence Number
  756. *
  757. * if AAD is 12 bytes:
  758. * AAD[3] = {A0, A1, A2};
  759. * padded AAD in xmm register = {A2 A1 A0 0}
  760. *
  761. * 0 1 2 3
  762. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  763. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  766. * | SPI (A2) |
  767. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  768. * | 64-bit Extended Sequence Number {A1,A0} |
  769. * | |
  770. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  771. * | 0x0 |
  772. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  773. *
  774. * AAD Format with 64-bit Extended Sequence Number
  775. *
  776. * aadLen:
  777. * from the definition of the spec, aadLen can only be 8 or 12 bytes.
  778. * The code supports 16 too but for other sizes, the code will fail.
  779. *
  780. * TLen:
  781. * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
  782. * For other sizes, the code will fail.
  783. *
  784. * poly = x^128 + x^127 + x^126 + x^121 + 1
  785. *
  786. *****************************************************************************/
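/*
 * Illustrative caller sketch in C (not part of this file; the real prototype
 * lives in the glue code, and the u8/u64 types, buffer names and 16-byte tag
 * length used here are assumptions taken from the comment above):
 *
 *   u8 iv[16];                     // 4-byte salt || 8-byte IV || 0x00000001
 *   u8 tag[16];
 *   memcpy(iv, salt, 4);
 *   memcpy(iv + 4, esp_iv, 8);
 *   iv[12] = 0; iv[13] = 0; iv[14] = 0; iv[15] = 1;
 *   aesni_gcm_dec(aes_ctx, plaintext, ciphertext, ciphertext_len,
 *                 iv, hash_subkey, aad, aad_len, tag, 16);
 *   // the caller then compares 'tag' against the received ICV and only
 *   // uses the plaintext if they match
 */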
  787. ENTRY(aesni_gcm_dec)
  788. push %r12
  789. push %r13
  790. push %r14
  791. mov %rsp, %r14
  792. /*
  793. * states of %xmm registers %xmm6:%xmm15 not saved
  794. * all %xmm registers are clobbered
  795. */
  796. sub $VARIABLE_OFFSET, %rsp
  797. and $~63, %rsp # align rsp to 64 bytes
  798. mov %arg6, %r12
  799. movdqu (%r12), %xmm13 # %xmm13 = HashKey
  800. pshufb SHUF_MASK(%rip), %xmm13
  801. # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
  802. movdqa %xmm13, %xmm2
  803. psllq $1, %xmm13
  804. psrlq $63, %xmm2
  805. movdqa %xmm2, %xmm1
  806. pslldq $8, %xmm2
  807. psrldq $8, %xmm1
  808. por %xmm2, %xmm13
  809. # Reduction
  810. pshufd $0x24, %xmm1, %xmm2
  811. pcmpeqd TWOONE(%rip), %xmm2
  812. pand POLY(%rip), %xmm2
  813. pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
  814. # Decrypt first few blocks
  815. movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
  816. mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
  817. and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
  818. mov %r13, %r12
  819. and $(3<<4), %r12
  820. jz _initial_num_blocks_is_0_decrypt
  821. cmp $(2<<4), %r12
  822. jb _initial_num_blocks_is_1_decrypt
  823. je _initial_num_blocks_is_2_decrypt
  824. _initial_num_blocks_is_3_decrypt:
  825. INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  826. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
  827. sub $48, %r13
  828. jmp _initial_blocks_decrypted
  829. _initial_num_blocks_is_2_decrypt:
  830. INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  831. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
  832. sub $32, %r13
  833. jmp _initial_blocks_decrypted
  834. _initial_num_blocks_is_1_decrypt:
  835. INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  836. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
  837. sub $16, %r13
  838. jmp _initial_blocks_decrypted
  839. _initial_num_blocks_is_0_decrypt:
  840. INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  841. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
  842. _initial_blocks_decrypted:
  843. cmp $0, %r13
  844. je _zero_cipher_left_decrypt
  845. sub $64, %r13
  846. je _four_cipher_left_decrypt
  847. _decrypt_by_4:
  848. GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  849. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
  850. add $64, %r11
  851. sub $64, %r13
  852. jne _decrypt_by_4
  853. _four_cipher_left_decrypt:
  854. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  855. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  856. _zero_cipher_left_decrypt:
  857. mov %arg4, %r13
  858. and $15, %r13 # %r13 = arg4 (mod 16)
  859. je _multiple_of_16_bytes_decrypt
860. # Handle the last <16 byte block separately
  861. paddd ONE(%rip), %xmm0 # increment CNT to get Yn
  862. pshufb SHUF_MASK(%rip), %xmm0
  863. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
  864. sub $16, %r11
  865. add %r13, %r11
866. movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
  867. lea SHIFT_MASK+16(%rip), %r12
  868. sub %r13, %r12
  869. # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
  870. # (%r13 is the number of bytes in plaintext mod 16)
  871. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
872. pshufb %xmm2, %xmm1 # right shift 16-%r13 bytes
  873. movdqa %xmm1, %xmm2
  874. pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
  875. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  876. # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
  877. pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
  878. pand %xmm1, %xmm2
  879. pshufb SHUF_MASK(%rip),%xmm2
  880. pxor %xmm2, %xmm8
  881. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  882. # GHASH computation for the last <16 byte block
  883. sub %r13, %r11
  884. add $16, %r11
  885. # output %r13 bytes
  886. movq %xmm0, %rax
  887. cmp $8, %r13
  888. jle _less_than_8_bytes_left_decrypt
  889. mov %rax, (%arg2 , %r11, 1)
  890. add $8, %r11
  891. psrldq $8, %xmm0
  892. movq %xmm0, %rax
  893. sub $8, %r13
  894. _less_than_8_bytes_left_decrypt:
  895. mov %al, (%arg2, %r11, 1)
  896. add $1, %r11
  897. shr $8, %rax
  898. sub $1, %r13
  899. jne _less_than_8_bytes_left_decrypt
  900. _multiple_of_16_bytes_decrypt:
901. mov arg8, %r12 # %r12 = aadLen (number of bytes)
  902. shl $3, %r12 # convert into number of bits
  903. movd %r12d, %xmm15 # len(A) in %xmm15
904. shl $3, %arg4 # len(C) in bits (*8)
  905. movq %arg4, %xmm1
  906. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  907. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  908. pxor %xmm15, %xmm8
  909. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  910. # final GHASH computation
  911. pshufb SHUF_MASK(%rip), %xmm8
  912. mov %arg5, %rax # %rax = *Y0
  913. movdqu (%rax), %xmm0 # %xmm0 = Y0
  914. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
  915. pxor %xmm8, %xmm0
  916. _return_T_decrypt:
  917. mov arg9, %r10 # %r10 = authTag
  918. mov arg10, %r11 # %r11 = auth_tag_len
  919. cmp $16, %r11
  920. je _T_16_decrypt
  921. cmp $12, %r11
  922. je _T_12_decrypt
  923. _T_8_decrypt:
  924. movq %xmm0, %rax
  925. mov %rax, (%r10)
  926. jmp _return_T_done_decrypt
  927. _T_12_decrypt:
  928. movq %xmm0, %rax
  929. mov %rax, (%r10)
  930. psrldq $8, %xmm0
  931. movd %xmm0, %eax
  932. mov %eax, 8(%r10)
  933. jmp _return_T_done_decrypt
  934. _T_16_decrypt:
  935. movdqu %xmm0, (%r10)
  936. _return_T_done_decrypt:
  937. mov %r14, %rsp
  938. pop %r14
  939. pop %r13
  940. pop %r12
  941. ret
  942. /*****************************************************************************
  943. * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  944. * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
  945. * const u8 *in, // Plaintext input
  946. * u64 plaintext_len, // Length of data in bytes for encryption.
  947. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  948. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  949. * // concatenated with 0x00000001. 16-byte aligned pointer.
  950. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  951. * const u8 *aad, // Additional Authentication Data (AAD)
  952. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  953. * u8 *auth_tag, // Authenticated Tag output.
  954. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
  955. * // 12 or 8.
  956. *
  957. * Assumptions:
  958. *
  959. * keys:
  960. * keys are pre-expanded and aligned to 16 bytes. we are using the
  961. * first set of 11 keys in the data structure void *aes_ctx
  962. *
  963. *
  964. * iv:
  965. * 0 1 2 3
  966. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  967. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  968. * | Salt (From the SA) |
  969. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  970. * | Initialization Vector |
  971. * | (This is the sequence number from IPSec header) |
  972. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  973. * | 0x1 |
  974. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  975. *
  976. *
  977. *
  978. * AAD:
  979. * AAD padded to 128 bits with 0
  980. * for example, assume AAD is a u32 vector
  981. *
  982. * if AAD is 8 bytes:
  983. * AAD[3] = {A0, A1};
  984. * padded AAD in xmm register = {A1 A0 0 0}
  985. *
  986. * 0 1 2 3
  987. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  988. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  989. * | SPI (A1) |
  990. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  991. * | 32-bit Sequence Number (A0) |
  992. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  993. * | 0x0 |
  994. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  995. *
  996. * AAD Format with 32-bit Sequence Number
  997. *
  998. * if AAD is 12 bytes:
  999. * AAD[3] = {A0, A1, A2};
  1000. * padded AAD in xmm register = {A2 A1 A0 0}
  1001. *
  1002. * 0 1 2 3
  1003. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1004. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1005. * | SPI (A2) |
  1006. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1007. * | 64-bit Extended Sequence Number {A1,A0} |
  1008. * | |
  1009. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1010. * | 0x0 |
  1011. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1012. *
  1013. * AAD Format with 64-bit Extended Sequence Number
  1014. *
  1015. * aadLen:
  1016. * from the definition of the spec, aadLen can only be 8 or 12 bytes.
  1017. * The code supports 16 too but for other sizes, the code will fail.
  1018. *
  1019. * TLen:
  1020. * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
  1021. * For other sizes, the code will fail.
  1022. *
  1023. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1024. ***************************************************************************/
  1025. ENTRY(aesni_gcm_enc)
  1026. push %r12
  1027. push %r13
  1028. push %r14
  1029. mov %rsp, %r14
  1030. #
  1031. # states of %xmm registers %xmm6:%xmm15 not saved
  1032. # all %xmm registers are clobbered
  1033. #
  1034. sub $VARIABLE_OFFSET, %rsp
  1035. and $~63, %rsp
  1036. mov %arg6, %r12
  1037. movdqu (%r12), %xmm13
  1038. pshufb SHUF_MASK(%rip), %xmm13
  1039. # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
  1040. movdqa %xmm13, %xmm2
  1041. psllq $1, %xmm13
  1042. psrlq $63, %xmm2
  1043. movdqa %xmm2, %xmm1
  1044. pslldq $8, %xmm2
  1045. psrldq $8, %xmm1
  1046. por %xmm2, %xmm13
  1047. # reduce HashKey<<1
  1048. pshufd $0x24, %xmm1, %xmm2
  1049. pcmpeqd TWOONE(%rip), %xmm2
  1050. pand POLY(%rip), %xmm2
  1051. pxor %xmm2, %xmm13
  1052. movdqa %xmm13, HashKey(%rsp)
  1053. mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly)
  1054. and $-16, %r13
  1055. mov %r13, %r12
  1056. # Encrypt first few blocks
  1057. and $(3<<4), %r12
  1058. jz _initial_num_blocks_is_0_encrypt
  1059. cmp $(2<<4), %r12
  1060. jb _initial_num_blocks_is_1_encrypt
  1061. je _initial_num_blocks_is_2_encrypt
  1062. _initial_num_blocks_is_3_encrypt:
  1063. INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1064. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
  1065. sub $48, %r13
  1066. jmp _initial_blocks_encrypted
  1067. _initial_num_blocks_is_2_encrypt:
  1068. INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1069. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
  1070. sub $32, %r13
  1071. jmp _initial_blocks_encrypted
  1072. _initial_num_blocks_is_1_encrypt:
  1073. INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1074. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
  1075. sub $16, %r13
  1076. jmp _initial_blocks_encrypted
  1077. _initial_num_blocks_is_0_encrypt:
  1078. INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1079. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
  1080. _initial_blocks_encrypted:
  1081. # Main loop - Encrypt remaining blocks
  1082. cmp $0, %r13
  1083. je _zero_cipher_left_encrypt
  1084. sub $64, %r13
  1085. je _four_cipher_left_encrypt
  1086. _encrypt_by_4_encrypt:
  1087. GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  1088. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
  1089. add $64, %r11
  1090. sub $64, %r13
  1091. jne _encrypt_by_4_encrypt
  1092. _four_cipher_left_encrypt:
  1093. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  1094. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  1095. _zero_cipher_left_encrypt:
  1096. mov %arg4, %r13
  1097. and $15, %r13 # %r13 = arg4 (mod 16)
  1098. je _multiple_of_16_bytes_encrypt
1099. # Handle the last <16 Byte block separately
  1100. paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
  1101. pshufb SHUF_MASK(%rip), %xmm0
  1102. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
  1103. sub $16, %r11
  1104. add %r13, %r11
  1105. movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
  1106. lea SHIFT_MASK+16(%rip), %r12
  1107. sub %r13, %r12
  1108. # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
  1109. # (%r13 is the number of bytes in plaintext mod 16)
  1110. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1111. pshufb %xmm2, %xmm1 # shift right 16-r13 bytes
  1112. pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
  1113. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  1114. # get the appropriate mask to mask out top 16-r13 bytes of xmm0
  1115. pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
  1116. pshufb SHUF_MASK(%rip),%xmm0
  1117. pxor %xmm0, %xmm8
  1118. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1119. # GHASH computation for the last <16 byte block
  1120. sub %r13, %r11
  1121. add $16, %r11
  1122. pshufb SHUF_MASK(%rip), %xmm0
  1123. # shuffle xmm0 back to output as ciphertext
  1124. # Output %r13 bytes
  1125. movq %xmm0, %rax
  1126. cmp $8, %r13
  1127. jle _less_than_8_bytes_left_encrypt
  1128. mov %rax, (%arg2 , %r11, 1)
  1129. add $8, %r11
  1130. psrldq $8, %xmm0
  1131. movq %xmm0, %rax
  1132. sub $8, %r13
  1133. _less_than_8_bytes_left_encrypt:
  1134. mov %al, (%arg2, %r11, 1)
  1135. add $1, %r11
  1136. shr $8, %rax
  1137. sub $1, %r13
  1138. jne _less_than_8_bytes_left_encrypt
  1139. _multiple_of_16_bytes_encrypt:
1140. mov arg8, %r12 # %r12 = aadLen (number of bytes)
  1141. shl $3, %r12
  1142. movd %r12d, %xmm15 # len(A) in %xmm15
1143. shl $3, %arg4 # len(C) in bits (*8)
  1144. movq %arg4, %xmm1
  1145. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  1146. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  1147. pxor %xmm15, %xmm8
  1148. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1149. # final GHASH computation
  1150. pshufb SHUF_MASK(%rip), %xmm8 # perform a 16 byte swap
  1151. mov %arg5, %rax # %rax = *Y0
  1152. movdqu (%rax), %xmm0 # %xmm0 = Y0
  1153. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
  1154. pxor %xmm8, %xmm0
  1155. _return_T_encrypt:
  1156. mov arg9, %r10 # %r10 = authTag
  1157. mov arg10, %r11 # %r11 = auth_tag_len
  1158. cmp $16, %r11
  1159. je _T_16_encrypt
  1160. cmp $12, %r11
  1161. je _T_12_encrypt
  1162. _T_8_encrypt:
  1163. movq %xmm0, %rax
  1164. mov %rax, (%r10)
  1165. jmp _return_T_done_encrypt
  1166. _T_12_encrypt:
  1167. movq %xmm0, %rax
  1168. mov %rax, (%r10)
  1169. psrldq $8, %xmm0
  1170. movd %xmm0, %eax
  1171. mov %eax, 8(%r10)
  1172. jmp _return_T_done_encrypt
  1173. _T_16_encrypt:
  1174. movdqu %xmm0, (%r10)
  1175. _return_T_done_encrypt:
  1176. mov %r14, %rsp
  1177. pop %r14
  1178. pop %r13
  1179. pop %r12
  1180. ret
  1181. _key_expansion_128:
  1182. _key_expansion_256a:
  1183. pshufd $0b11111111, %xmm1, %xmm1
  1184. shufps $0b00010000, %xmm0, %xmm4
  1185. pxor %xmm4, %xmm0
  1186. shufps $0b10001100, %xmm0, %xmm4
  1187. pxor %xmm4, %xmm0
  1188. pxor %xmm1, %xmm0
  1189. movaps %xmm0, (TKEYP)
  1190. add $0x10, TKEYP
  1191. ret
  1192. .align 4
  1193. _key_expansion_192a:
  1194. pshufd $0b01010101, %xmm1, %xmm1
  1195. shufps $0b00010000, %xmm0, %xmm4
  1196. pxor %xmm4, %xmm0
  1197. shufps $0b10001100, %xmm0, %xmm4
  1198. pxor %xmm4, %xmm0
  1199. pxor %xmm1, %xmm0
  1200. movaps %xmm2, %xmm5
  1201. movaps %xmm2, %xmm6
  1202. pslldq $4, %xmm5
  1203. pshufd $0b11111111, %xmm0, %xmm3
  1204. pxor %xmm3, %xmm2
  1205. pxor %xmm5, %xmm2
  1206. movaps %xmm0, %xmm1
  1207. shufps $0b01000100, %xmm0, %xmm6
  1208. movaps %xmm6, (TKEYP)
  1209. shufps $0b01001110, %xmm2, %xmm1
  1210. movaps %xmm1, 0x10(TKEYP)
  1211. add $0x20, TKEYP
  1212. ret
  1213. .align 4
  1214. _key_expansion_192b:
  1215. pshufd $0b01010101, %xmm1, %xmm1
  1216. shufps $0b00010000, %xmm0, %xmm4
  1217. pxor %xmm4, %xmm0
  1218. shufps $0b10001100, %xmm0, %xmm4
  1219. pxor %xmm4, %xmm0
  1220. pxor %xmm1, %xmm0
  1221. movaps %xmm2, %xmm5
  1222. pslldq $4, %xmm5
  1223. pshufd $0b11111111, %xmm0, %xmm3
  1224. pxor %xmm3, %xmm2
  1225. pxor %xmm5, %xmm2
  1226. movaps %xmm0, (TKEYP)
  1227. add $0x10, TKEYP
  1228. ret
  1229. .align 4
  1230. _key_expansion_256b:
  1231. pshufd $0b10101010, %xmm1, %xmm1
  1232. shufps $0b00010000, %xmm2, %xmm4
  1233. pxor %xmm4, %xmm2
  1234. shufps $0b10001100, %xmm2, %xmm4
  1235. pxor %xmm4, %xmm2
  1236. pxor %xmm1, %xmm2
  1237. movaps %xmm2, (TKEYP)
  1238. add $0x10, TKEYP
  1239. ret
  1240. /*
  1241. * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
  1242. * unsigned int key_len)
  1243. */
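/*
 * The fixed offsets used below (expanded encryption keys at 0, decryption
 * keys at 240, key length at 480) assume a context layout equivalent to the
 * following sketch (an assumption for illustration, not a definition used by
 * this file):
 *
 *   struct crypto_aes_ctx {
 *           u32 key_enc[60];        // up to 15 round keys of 16 bytes
 *           u32 key_dec[60];
 *           u32 key_length;         // 16, 24 or 32
 *   };
 */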
ENTRY(aesni_set_key)
#ifndef __x86_64__
	pushl KEYP
	movl 8(%esp), KEYP	# ctx
	movl 12(%esp), UKEYP	# in_key
	movl 16(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0	# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP	# key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4	# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call _key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call _key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call _key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call _key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call _key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call _key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call _key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call _key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call _key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call _key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call _key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call _key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call _key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call _key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call _key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call _key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call _key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
	.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	AESIMC %xmm0 %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG
#ifndef __x86_64__
	popl KEYP
#endif
	ret

/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
ENTRY(aesni_enc)
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl 12(%esp), KEYP
	movl 16(%esp), OUTP
	movl 20(%esp), INP
#endif
	movl 480(KEYP), KLEN	# key length
	movups (INP), STATE	# input
	call _aesni_enc1
	movups STATE, (OUTP)	# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	ret

/*
 * _aesni_enc1: internal ABI
 * input:
 *	KEYP: key struct pointer
 *	KLEN: key length (in bytes)
 *	STATE: initial state (input)
 * output:
 *	STATE: final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
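/*
 * Round-key addressing note: TKEYP is biased by the key size (+0x30 for
 * 128-bit, +0x50 for 192-bit, +0x70 for 256-bit keys) so that the common
 * tail at .Lenc128 can address the last ten round keys at the fixed
 * offsets -0x20(TKEYP) ... 0x70(TKEYP), with the final round key always
 * at 0x70(TKEYP).  The extra rounds needed by 192- and 256-bit keys are
 * the AESENC steps executed before falling through to .Lenc192 and
 * .Lenc128.
 */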
	.align 4
_aesni_enc1:
	movaps (KEYP), KEY	# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE
	.align 4
.Lenc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE
	.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps (TKEYP), KEY
	AESENC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE
	ret

/*
 * _aesni_enc4: internal ABI
 * input:
 *	KEYP: key struct pointer
 *	KLEN: key length (in bytes)
 *	STATE1: initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1: final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
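/*
 * Interleaving note: all four states are encrypted with the same round
 * key before the next key is loaded, so the four AESENC instructions per
 * round are independent and can overlap in the pipeline.  This hides the
 * AESENC latency and is why the bulk ECB/CBC-decrypt/CTR paths below
 * process four blocks per iteration whenever at least 64 bytes remain.
 */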
	.align 4
_aesni_enc4:
	movaps (KEYP), KEY	# key
	mov KEYP, TKEYP
	pxor KEY, STATE1	# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	movaps -0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	movaps -0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps (TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESENCLAST KEY STATE1	# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret

/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 */
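/*
 * Decryption uses the round keys that aesni_set_key stored starting at
 * context offset 240 (the encryption round keys in reverse order, with
 * AESIMC applied to all but the outermost two), hence the "add $240,
 * KEYP" before calling _aesni_dec1.
 */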
ENTRY(aesni_dec)
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl 12(%esp), KEYP
	movl 16(%esp), OUTP
	movl 20(%esp), INP
#endif
	mov 480(KEYP), KLEN	# key length
	add $240, KEYP
	movups (INP), STATE	# input
	call _aesni_dec1
	movups STATE, (OUTP)	# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	ret

/*
 * _aesni_dec1: internal ABI
 * input:
 *	KEYP: key struct pointer
 *	KLEN: key length (in bytes)
 *	STATE: initial state (input)
 * output:
 *	STATE: final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
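/*
 * Same round-key addressing scheme as _aesni_enc1: TKEYP is biased by the
 * key size so the shared tail at .Ldec128 reads the last ten round keys
 * at fixed offsets, with the AESDECLAST key always at 0x70(TKEYP).
 */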
	.align 4
_aesni_dec1:
	movaps (KEYP), KEY	# key
	mov KEYP, TKEYP
	pxor KEY, STATE		# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP
	je .Ldec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE
	.align 4
.Ldec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE
	.align 4
.Ldec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps (TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE
	ret

/*
 * _aesni_dec4: internal ABI
 * input:
 *	KEYP: key struct pointer
 *	KLEN: key length (in bytes)
 *	STATE1: initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1: final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
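/*
 * Four-way variant of _aesni_dec1; as in _aesni_enc4, each round key is
 * applied to all four states back to back so the independent AESDEC
 * instructions can overlap in the pipeline.
 */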
	.align 4
_aesni_dec4:
	movaps (KEYP), KEY	# key
	mov KEYP, TKEYP
	pxor KEY, STATE1	# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP
	je .L4dec192
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	.align 4
.L4dec192:
	movaps -0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	.align 4
.L4dec128:
	movaps -0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps -0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps (TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	movaps 0x70(TKEYP), KEY
	AESDECLAST KEY STATE1	# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret

/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
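/*
 * ECB encrypts each 16-byte block independently.  Illustrative pseudocode
 * (aes_encrypt_block is a placeholder name, not a real kernel helper):
 *
 *	for (i = 0; i + 16 <= len; i += 16)
 *		aes_encrypt_block(key, dst + i, src + i);
 *
 * Because the blocks are independent, the loop below encrypts four blocks
 * per iteration via _aesni_enc4 while at least 64 bytes remain, then falls
 * back to one block at a time; a trailing partial block is ignored.
 */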
ENTRY(aesni_ecb_enc)
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 16(%esp), KEYP
	movl 20(%esp), OUTP
	movl 24(%esp), INP
	movl 28(%esp), LEN
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
	.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_enc_loop4
	cmp $16, LEN
	jb .Lecb_enc_ret
	.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_enc_loop1
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	ret

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len)
 */
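/*
 * Mirror image of aesni_ecb_enc: each block is decrypted independently
 * with the decryption key schedule at context offset 240, four blocks per
 * iteration while at least 64 bytes remain.
 */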
ENTRY(aesni_ecb_dec)
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 16(%esp), KEYP
	movl 20(%esp), OUTP
	movl 24(%esp), INP
	movl 28(%esp), LEN
#endif
	test LEN, LEN		# check length
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
	.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
	.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lecb_dec_loop1
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	ret

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
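/*
 * CBC encryption chains each ciphertext block into the next, so it cannot
 * be parallelised; the loop below therefore handles one block at a time.
 * Illustrative pseudocode (aes_encrypt_block and xor_block are placeholder
 * names):
 *
 *	prev = iv;
 *	for (i = 0; i + 16 <= len; i += 16) {
 *		xor_block(tmp, src + i, prev);
 *		aes_encrypt_block(key, dst + i, tmp);
 *		prev = dst + i;
 *	}
 *	memcpy(iv, prev, 16);
 *
 * The final ciphertext block is written back to *iv so that a subsequent
 * call can continue the chain.
 */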
ENTRY(aesni_cbc_enc)
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP
	movl 24(%esp), OUTP
	movl 28(%esp), INP
	movl 32(%esp), LEN
	movl 36(%esp), IVP
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
	.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_enc_loop
	movups STATE, (IVP)
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
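/*
 * CBC decryption is parallelisable because each plaintext block depends on
 * only two ciphertext blocks: P[i] = D_K(C[i]) ^ C[i-1], with C[-1] being
 * the IV.  The four-block loop below exploits this; note that the 32-bit
 * build has fewer XMM registers, so it re-reads two of the ciphertext
 * blocks from memory for the chaining XOR instead of keeping them in
 * registers.
 */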
ENTRY(aesni_cbc_dec)
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP
	movl 24(%esp), OUTP
	movl 28(%esp), INP
	movl 32(%esp), LEN
	movl 36(%esp), IVP
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
	.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor (INP), STATE2
	pxor 0x10(INP), STATE3
	pxor IN1, STATE4
	movaps IN2, IV
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
	.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret

#ifdef __x86_64__
	.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/*
 * _aesni_inc_init: internal ABI
 * set up the registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR: == IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC: == 1, in little endian
 *	BSWAP_MASK: == endian swapping mask
 */
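/*
 * The CTR-mode counter in the IV is big endian, but paddq works on
 * little-endian quadwords, so the counter is byte-swapped into CTR once
 * here and swapped back on every output of _aesni_inc.  TCTR_LOW shadows
 * the low 64 bits of the counter in a general-purpose register so that
 * carries can be detected with an ordinary scalar add.
 */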
	.align 4
_aesni_inc_init:
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC
	MOVQ_R64_XMM CTR TCTR_LOW
	ret

/*
 * _aesni_inc: internal ABI
 * Increment IV by 1; IV is stored in big endian
 * input:
 *	IV
 *	CTR: == IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC: == 1, in little endian
 *	BSWAP_MASK: == endian swapping mask
 * output:
 *	IV: incremented by 1
 * changed:
 *	CTR: == output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
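/*
 * paddq only increments the low quadword of CTR (INC has 1 in its low
 * quadword and 0 in its high quadword), so a carry out of the low 64 bits
 * would be lost.  TCTR_LOW mirrors that quadword: when the scalar
 * "add $1" carries, INC is temporarily shifted into the high quadword and
 * added again to propagate the carry, then shifted back.
 */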
	.align 4
_aesni_inc:
	paddq INC, CTR
	add $1, TCTR_LOW
	jnc .Linc_low
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV
	ret

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 */
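/*
 * CTR mode turns the block cipher into a stream cipher: the keystream is
 * E_K(ctr), E_K(ctr+1), ... XORed into the data, so encryption and
 * decryption are the same operation and blocks can be processed in
 * parallel (four at a time below).  Illustrative pseudocode
 * (aes_encrypt_block, increment_be128 and xor_block are placeholder
 * names):
 *
 *	for (i = 0; i + 16 <= len; i += 16) {
 *		aes_encrypt_block(key, ks, iv);		keystream block
 *		increment_be128(iv);			big-endian counter + 1
 *		xor_block(dst + i, src + i, ks);
 *	}
 *
 * The incremented counter is written back to *iv on exit so the caller
 * can continue the stream; this routine is built for 64-bit only.
 */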
ENTRY(aesni_ctr_enc)
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
	.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
	.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)
.Lctr_enc_just_ret:
	ret
#endif