aesni-intel_asm.S

  1. /*
  2. * Implement AES algorithm in Intel AES-NI instructions.
  3. *
  4. * The white paper of AES-NI instructions can be downloaded from:
  5. * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
  6. *
  7. * Copyright (C) 2008, Intel Corp.
  8. * Author: Huang Ying <ying.huang@intel.com>
  9. * Vinodh Gopal <vinodh.gopal@intel.com>
  10. * Kahraman Akdemir
  11. *
  12. * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  13. * interface for 64-bit kernels.
  14. * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  15. * Aidan O'Mahony (aidan.o.mahony@intel.com)
  16. * Adrian Hoban <adrian.hoban@intel.com>
  17. * James Guilford (james.guilford@intel.com)
  18. * Gabriele Paoloni <gabriele.paoloni@intel.com>
  19. * Tadeusz Struk (tadeusz.struk@intel.com)
  20. * Wajdi Feghali (wajdi.k.feghali@intel.com)
  21. * Copyright (c) 2010, Intel Corporation.
  22. *
  23. * Ported x86_64 version to x86:
  24. * Author: Mathias Krause <minipli@googlemail.com>
  25. *
  26. * This program is free software; you can redistribute it and/or modify
  27. * it under the terms of the GNU General Public License as published by
  28. * the Free Software Foundation; either version 2 of the License, or
  29. * (at your option) any later version.
  30. */
  31. #include <linux/linkage.h>
  32. #include <asm/inst.h>
  33. #ifdef __x86_64__
  34. .data
  35. .align 16
  36. .Lgf128mul_x_ble_mask:
  37. .octa 0x00000000000000010000000000000087
  38. POLY: .octa 0xC2000000000000000000000000000001
  39. TWOONE: .octa 0x00000001000000000000000000000001
  40. # order of these constants should not change.
  41. # more specifically, ALL_F should follow SHIFT_MASK,
  42. # and ZERO should follow ALL_F
  43. SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
  44. MASK1: .octa 0x0000000000000000ffffffffffffffff
  45. MASK2: .octa 0xffffffffffffffff0000000000000000
  46. SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  47. ALL_F: .octa 0xffffffffffffffffffffffffffffffff
  48. ZERO: .octa 0x00000000000000000000000000000000
  49. ONE: .octa 0x00000000000000000000000000000001
  50. F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  51. dec: .octa 0x1
  52. enc: .octa 0x2
  53. .text
  54. #define STACK_OFFSET 8*3
  55. #define HashKey 16*0 // store HashKey <<1 mod poly here
  56. #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
  57. #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
  58. #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
  59. #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
  60. // bits of HashKey <<1 mod poly here
  61. //(for Karatsuba purposes)
  62. #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
  63. // bits of HashKey^2 <<1 mod poly here
  64. // (for Karatsuba purposes)
  65. #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
  66. // bits of HashKey^3 <<1 mod poly here
  67. // (for Karatsuba purposes)
  68. #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
  69. // bits of HashKey^4 <<1 mod poly here
  70. // (for Karatsuba purposes)
  71. #define VARIABLE_OFFSET 16*8
  72. #define arg1 rdi
  73. #define arg2 rsi
  74. #define arg3 rdx
  75. #define arg4 rcx
  76. #define arg5 r8
  77. #define arg6 r9
  78. #define arg7 STACK_OFFSET+8(%r14)
  79. #define arg8 STACK_OFFSET+16(%r14)
  80. #define arg9 STACK_OFFSET+24(%r14)
  81. #define arg10 STACK_OFFSET+32(%r14)
  82. #endif
  83. #define STATE1 %xmm0
  84. #define STATE2 %xmm4
  85. #define STATE3 %xmm5
  86. #define STATE4 %xmm6
  87. #define STATE STATE1
  88. #define IN1 %xmm1
  89. #define IN2 %xmm7
  90. #define IN3 %xmm8
  91. #define IN4 %xmm9
  92. #define IN IN1
  93. #define KEY %xmm2
  94. #define IV %xmm3
  95. #define BSWAP_MASK %xmm10
  96. #define CTR %xmm11
  97. #define INC %xmm12
  98. #define GF128MUL_MASK %xmm10
  99. #ifdef __x86_64__
  100. #define AREG %rax
  101. #define KEYP %rdi
  102. #define OUTP %rsi
  103. #define UKEYP OUTP
  104. #define INP %rdx
  105. #define LEN %rcx
  106. #define IVP %r8
  107. #define KLEN %r9d
  108. #define T1 %r10
  109. #define TKEYP T1
  110. #define T2 %r11
  111. #define TCTR_LOW T2
  112. #else
  113. #define AREG %eax
  114. #define KEYP %edi
  115. #define OUTP AREG
  116. #define UKEYP OUTP
  117. #define INP %edx
  118. #define LEN %esi
  119. #define IVP %ebp
  120. #define KLEN %ebx
  121. #define T1 %ecx
  122. #define TKEYP T1
  123. #endif
  124. #ifdef __x86_64__
  125. /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
  126. *
  127. *
  128. * Input: A and B (128-bits each, bit-reflected)
  129. * Output: C = A*B*x mod poly, (i.e. >>1 )
  130. * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
  131. * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
  132. *
  133. */
  134. .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
  135. movdqa \GH, \TMP1
  136. pshufd $78, \GH, \TMP2
  137. pshufd $78, \HK, \TMP3
  138. pxor \GH, \TMP2 # TMP2 = a1+a0
  139. pxor \HK, \TMP3 # TMP3 = b1+b0
  140. PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
  141. PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
  142. PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
  143. pxor \GH, \TMP2
pxor \TMP1, \TMP2 # TMP2 = (a1*b0)+(a0*b1)
  145. movdqa \TMP2, \TMP3
  146. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  147. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  148. pxor \TMP3, \GH
pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
  150. # first phase of the reduction
  151. movdqa \GH, \TMP2
  152. movdqa \GH, \TMP3
movdqa \GH, \TMP4 # copy GH into TMP2, TMP3 and TMP4
# in order to perform
# three independent shifts
pslld $31, \TMP2 # packed left shift << 31
pslld $30, \TMP3 # packed left shift << 30
pslld $25, \TMP4 # packed left shift << 25
  159. pxor \TMP3, \TMP2 # xor the shifted versions
  160. pxor \TMP4, \TMP2
  161. movdqa \TMP2, \TMP5
  162. psrldq $4, \TMP5 # right shift TMP5 1 DW
  163. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  164. pxor \TMP2, \GH
  165. # second phase of the reduction
movdqa \GH,\TMP2 # copy GH into TMP2, TMP3 and TMP4
# in order to perform
# three independent shifts
movdqa \GH,\TMP3
movdqa \GH,\TMP4
psrld $1,\TMP2 # packed right shift >> 1
psrld $2,\TMP3 # packed right shift >> 2
psrld $7,\TMP4 # packed right shift >> 7
  174. pxor \TMP3,\TMP2 # xor the shifted versions
  175. pxor \TMP4,\TMP2
  176. pxor \TMP5, \TMP2
  177. pxor \TMP2, \GH
pxor \TMP1, \GH # result is in GH
  179. .endm
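/*
 * Added for illustration (not part of the original source): a C sketch of
 * the Karatsuba carry-less multiply that GHASH_MUL implements.  clmul64()
 * is a hypothetical stand-in for PCLMULQDQ (64x64 -> 128-bit carry-less
 * multiply); every "+" in the comments above is an XOR in GF(2).
 *
 *   #include <stdint.h>
 *
 *   // carry-less multiply of two 64-bit values, product returned in hi:lo
 *   static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
 *   {
 *       uint64_t h = 0, l = 0;
 *       int i;
 *
 *       for (i = 0; i < 64; i++) {
 *           if ((b >> i) & 1) {
 *               l ^= a << i;
 *               if (i)
 *                   h ^= a >> (64 - i);
 *           }
 *       }
 *       *hi = h;
 *       *lo = l;
 *   }
 *
 *   // With A = a1:a0 and B = b1:b0, the three PCLMULQDQs above compute
 *   // a1*b1, a0*b0 and (a1^a0)*(b1^b0); the middle term is then
 *   // (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0 = a1*b0 ^ a0*b1, exactly the pxor
 *   // sequence of the macro.  The 256-bit product is finally reduced
 *   // modulo x^128 + x^127 + x^126 + x^121 + 1 in the two "phases of the
 *   // reduction" above.
 */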
  180. /*
  181. * if a = number of total plaintext bytes
  182. * b = floor(a/16)
  183. * num_initial_blocks = b mod 4
  184. * encrypt the initial num_initial_blocks blocks and apply ghash on
  185. * the ciphertext
  186. * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  187. * are clobbered
  188. * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
  189. */
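/*
 * Worked example (added for clarity, not in the original source): for
 * a = 100 bytes of plaintext, b = floor(100/16) = 6 full blocks, so
 * num_initial_blocks = 6 mod 4 = 2.  Those two blocks are handled by this
 * macro, the remaining four full blocks go through the 4-blocks-at-a-time
 * path, and the trailing 4 bytes are handled as the partial-block tail.
 */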
  190. .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
  191. XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
  192. mov arg7, %r10 # %r10 = AAD
  193. mov arg8, %r12 # %r12 = aadLen
  194. mov %r12, %r11
  195. pxor %xmm\i, %xmm\i
  196. _get_AAD_loop\num_initial_blocks\operation:
  197. movd (%r10), \TMP1
  198. pslldq $12, \TMP1
  199. psrldq $4, %xmm\i
  200. pxor \TMP1, %xmm\i
  201. add $4, %r10
  202. sub $4, %r12
  203. jne _get_AAD_loop\num_initial_blocks\operation
  204. cmp $16, %r11
  205. je _get_AAD_loop2_done\num_initial_blocks\operation
  206. mov $16, %r12
  207. _get_AAD_loop2\num_initial_blocks\operation:
  208. psrldq $4, %xmm\i
  209. sub $4, %r12
  210. cmp %r11, %r12
  211. jne _get_AAD_loop2\num_initial_blocks\operation
  212. _get_AAD_loop2_done\num_initial_blocks\operation:
  213. movdqa SHUF_MASK(%rip), %xmm14
  214. PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
  215. xor %r11, %r11 # initialise the data pointer offset as zero
  216. # start AES for num_initial_blocks blocks
  217. mov %arg5, %rax # %rax = *Y0
  218. movdqu (%rax), \XMM0 # XMM0 = Y0
  219. movdqa SHUF_MASK(%rip), %xmm14
  220. PSHUFB_XMM %xmm14, \XMM0
  221. .if (\i == 5) || (\i == 6) || (\i == 7)
  222. .irpc index, \i_seq
  223. paddd ONE(%rip), \XMM0 # INCR Y0
  224. movdqa \XMM0, %xmm\index
  225. movdqa SHUF_MASK(%rip), %xmm14
  226. PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
  227. .endr
  228. .irpc index, \i_seq
  229. pxor 16*0(%arg1), %xmm\index
  230. .endr
  231. .irpc index, \i_seq
  232. movaps 0x10(%rdi), \TMP1
  233. AESENC \TMP1, %xmm\index # Round 1
  234. .endr
  235. .irpc index, \i_seq
  236. movaps 0x20(%arg1), \TMP1
  237. AESENC \TMP1, %xmm\index # Round 2
  238. .endr
  239. .irpc index, \i_seq
  240. movaps 0x30(%arg1), \TMP1
AESENC \TMP1, %xmm\index # Round 3
  242. .endr
  243. .irpc index, \i_seq
  244. movaps 0x40(%arg1), \TMP1
AESENC \TMP1, %xmm\index # Round 4
  246. .endr
  247. .irpc index, \i_seq
  248. movaps 0x50(%arg1), \TMP1
AESENC \TMP1, %xmm\index # Round 5
  250. .endr
  251. .irpc index, \i_seq
  252. movaps 0x60(%arg1), \TMP1
AESENC \TMP1, %xmm\index # Round 6
  254. .endr
  255. .irpc index, \i_seq
  256. movaps 0x70(%arg1), \TMP1
AESENC \TMP1, %xmm\index # Round 7
  258. .endr
  259. .irpc index, \i_seq
  260. movaps 0x80(%arg1), \TMP1
AESENC \TMP1, %xmm\index # Round 8
  262. .endr
  263. .irpc index, \i_seq
  264. movaps 0x90(%arg1), \TMP1
AESENC \TMP1, %xmm\index # Round 9
  266. .endr
  267. .irpc index, \i_seq
  268. movaps 0xa0(%arg1), \TMP1
  269. AESENCLAST \TMP1, %xmm\index # Round 10
  270. .endr
  271. .irpc index, \i_seq
  272. movdqu (%arg3 , %r11, 1), \TMP1
  273. pxor \TMP1, %xmm\index
  274. movdqu %xmm\index, (%arg2 , %r11, 1)
  275. # write back plaintext/ciphertext for num_initial_blocks
  276. add $16, %r11
  277. movdqa \TMP1, %xmm\index
  278. movdqa SHUF_MASK(%rip), %xmm14
  279. PSHUFB_XMM %xmm14, %xmm\index
  280. # prepare plaintext/ciphertext for GHASH computation
  281. .endr
  282. .endif
  283. GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  284. # apply GHASH on num_initial_blocks blocks
  285. .if \i == 5
  286. pxor %xmm5, %xmm6
  287. GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  288. pxor %xmm6, %xmm7
  289. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  290. pxor %xmm7, %xmm8
  291. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  292. .elseif \i == 6
  293. pxor %xmm6, %xmm7
  294. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  295. pxor %xmm7, %xmm8
  296. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  297. .elseif \i == 7
  298. pxor %xmm7, %xmm8
  299. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  300. .endif
  301. cmp $64, %r13
  302. jl _initial_blocks_done\num_initial_blocks\operation
  303. # no need for precomputed values
/*
 * Precomputations for HashKey, done in parallel with the encryption of the
 * first 4 blocks.  HashKey_i_k holds the XOR of the low and high parts of
 * HashKey^i (for Karatsuba purposes).
 */
  309. paddd ONE(%rip), \XMM0 # INCR Y0
  310. movdqa \XMM0, \XMM1
  311. movdqa SHUF_MASK(%rip), %xmm14
  312. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  313. paddd ONE(%rip), \XMM0 # INCR Y0
  314. movdqa \XMM0, \XMM2
  315. movdqa SHUF_MASK(%rip), %xmm14
  316. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  317. paddd ONE(%rip), \XMM0 # INCR Y0
  318. movdqa \XMM0, \XMM3
  319. movdqa SHUF_MASK(%rip), %xmm14
  320. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  321. paddd ONE(%rip), \XMM0 # INCR Y0
  322. movdqa \XMM0, \XMM4
  323. movdqa SHUF_MASK(%rip), %xmm14
  324. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  325. pxor 16*0(%arg1), \XMM1
  326. pxor 16*0(%arg1), \XMM2
  327. pxor 16*0(%arg1), \XMM3
  328. pxor 16*0(%arg1), \XMM4
  329. movdqa \TMP3, \TMP5
  330. pshufd $78, \TMP3, \TMP1
  331. pxor \TMP3, \TMP1
  332. movdqa \TMP1, HashKey_k(%rsp)
  333. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  334. # TMP5 = HashKey^2<<1 (mod poly)
  335. movdqa \TMP5, HashKey_2(%rsp)
  336. # HashKey_2 = HashKey^2<<1 (mod poly)
  337. pshufd $78, \TMP5, \TMP1
  338. pxor \TMP5, \TMP1
  339. movdqa \TMP1, HashKey_2_k(%rsp)
  340. .irpc index, 1234 # do 4 rounds
  341. movaps 0x10*\index(%arg1), \TMP1
  342. AESENC \TMP1, \XMM1
  343. AESENC \TMP1, \XMM2
  344. AESENC \TMP1, \XMM3
  345. AESENC \TMP1, \XMM4
  346. .endr
  347. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  348. # TMP5 = HashKey^3<<1 (mod poly)
  349. movdqa \TMP5, HashKey_3(%rsp)
  350. pshufd $78, \TMP5, \TMP1
  351. pxor \TMP5, \TMP1
  352. movdqa \TMP1, HashKey_3_k(%rsp)
  353. .irpc index, 56789 # do next 5 rounds
  354. movaps 0x10*\index(%arg1), \TMP1
  355. AESENC \TMP1, \XMM1
  356. AESENC \TMP1, \XMM2
  357. AESENC \TMP1, \XMM3
  358. AESENC \TMP1, \XMM4
  359. .endr
  360. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
  362. movdqa \TMP5, HashKey_4(%rsp)
  363. pshufd $78, \TMP5, \TMP1
  364. pxor \TMP5, \TMP1
  365. movdqa \TMP1, HashKey_4_k(%rsp)
  366. movaps 0xa0(%arg1), \TMP2
  367. AESENCLAST \TMP2, \XMM1
  368. AESENCLAST \TMP2, \XMM2
  369. AESENCLAST \TMP2, \XMM3
  370. AESENCLAST \TMP2, \XMM4
  371. movdqu 16*0(%arg3 , %r11 , 1), \TMP1
  372. pxor \TMP1, \XMM1
  373. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  374. movdqa \TMP1, \XMM1
  375. movdqu 16*1(%arg3 , %r11 , 1), \TMP1
  376. pxor \TMP1, \XMM2
  377. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  378. movdqa \TMP1, \XMM2
  379. movdqu 16*2(%arg3 , %r11 , 1), \TMP1
  380. pxor \TMP1, \XMM3
  381. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  382. movdqa \TMP1, \XMM3
  383. movdqu 16*3(%arg3 , %r11 , 1), \TMP1
  384. pxor \TMP1, \XMM4
  385. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  386. movdqa \TMP1, \XMM4
  387. add $64, %r11
  388. movdqa SHUF_MASK(%rip), %xmm14
  389. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  390. pxor \XMMDst, \XMM1
  391. # combine GHASHed value with the corresponding ciphertext
  392. movdqa SHUF_MASK(%rip), %xmm14
  393. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  394. movdqa SHUF_MASK(%rip), %xmm14
  395. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  396. movdqa SHUF_MASK(%rip), %xmm14
  397. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  398. _initial_blocks_done\num_initial_blocks\operation:
  399. .endm
  400. /*
  401. * if a = number of total plaintext bytes
  402. * b = floor(a/16)
  403. * num_initial_blocks = b mod 4
  404. * encrypt the initial num_initial_blocks blocks and apply ghash on
  405. * the ciphertext
  406. * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
  407. * are clobbered
  408. * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
  409. */
  410. .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
  411. XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
  412. mov arg7, %r10 # %r10 = AAD
  413. mov arg8, %r12 # %r12 = aadLen
  414. mov %r12, %r11
  415. pxor %xmm\i, %xmm\i
  416. _get_AAD_loop\num_initial_blocks\operation:
  417. movd (%r10), \TMP1
  418. pslldq $12, \TMP1
  419. psrldq $4, %xmm\i
  420. pxor \TMP1, %xmm\i
  421. add $4, %r10
  422. sub $4, %r12
  423. jne _get_AAD_loop\num_initial_blocks\operation
  424. cmp $16, %r11
  425. je _get_AAD_loop2_done\num_initial_blocks\operation
  426. mov $16, %r12
  427. _get_AAD_loop2\num_initial_blocks\operation:
  428. psrldq $4, %xmm\i
  429. sub $4, %r12
  430. cmp %r11, %r12
  431. jne _get_AAD_loop2\num_initial_blocks\operation
  432. _get_AAD_loop2_done\num_initial_blocks\operation:
  433. movdqa SHUF_MASK(%rip), %xmm14
  434. PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
  435. xor %r11, %r11 # initialise the data pointer offset as zero
  436. # start AES for num_initial_blocks blocks
  437. mov %arg5, %rax # %rax = *Y0
  438. movdqu (%rax), \XMM0 # XMM0 = Y0
  439. movdqa SHUF_MASK(%rip), %xmm14
  440. PSHUFB_XMM %xmm14, \XMM0
  441. .if (\i == 5) || (\i == 6) || (\i == 7)
  442. .irpc index, \i_seq
  443. paddd ONE(%rip), \XMM0 # INCR Y0
  444. movdqa \XMM0, %xmm\index
  445. movdqa SHUF_MASK(%rip), %xmm14
  446. PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
  447. .endr
  448. .irpc index, \i_seq
  449. pxor 16*0(%arg1), %xmm\index
  450. .endr
  451. .irpc index, \i_seq
  452. movaps 0x10(%rdi), \TMP1
  453. AESENC \TMP1, %xmm\index # Round 1
  454. .endr
  455. .irpc index, \i_seq
  456. movaps 0x20(%arg1), \TMP1
  457. AESENC \TMP1, %xmm\index # Round 2
  458. .endr
  459. .irpc index, \i_seq
  460. movaps 0x30(%arg1), \TMP1
AESENC \TMP1, %xmm\index # Round 3
  462. .endr
  463. .irpc index, \i_seq
  464. movaps 0x40(%arg1), \TMP1
AESENC \TMP1, %xmm\index # Round 4
  466. .endr
  467. .irpc index, \i_seq
  468. movaps 0x50(%arg1), \TMP1
AESENC \TMP1, %xmm\index # Round 5
  470. .endr
  471. .irpc index, \i_seq
  472. movaps 0x60(%arg1), \TMP1
AESENC \TMP1, %xmm\index # Round 6
  474. .endr
  475. .irpc index, \i_seq
  476. movaps 0x70(%arg1), \TMP1
AESENC \TMP1, %xmm\index # Round 7
  478. .endr
  479. .irpc index, \i_seq
  480. movaps 0x80(%arg1), \TMP1
AESENC \TMP1, %xmm\index # Round 8
  482. .endr
  483. .irpc index, \i_seq
  484. movaps 0x90(%arg1), \TMP1
AESENC \TMP1, %xmm\index # Round 9
  486. .endr
  487. .irpc index, \i_seq
  488. movaps 0xa0(%arg1), \TMP1
  489. AESENCLAST \TMP1, %xmm\index # Round 10
  490. .endr
  491. .irpc index, \i_seq
  492. movdqu (%arg3 , %r11, 1), \TMP1
  493. pxor \TMP1, %xmm\index
  494. movdqu %xmm\index, (%arg2 , %r11, 1)
  495. # write back plaintext/ciphertext for num_initial_blocks
  496. add $16, %r11
  497. movdqa SHUF_MASK(%rip), %xmm14
  498. PSHUFB_XMM %xmm14, %xmm\index
  499. # prepare plaintext/ciphertext for GHASH computation
  500. .endr
  501. .endif
  502. GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  503. # apply GHASH on num_initial_blocks blocks
  504. .if \i == 5
  505. pxor %xmm5, %xmm6
  506. GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  507. pxor %xmm6, %xmm7
  508. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  509. pxor %xmm7, %xmm8
  510. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  511. .elseif \i == 6
  512. pxor %xmm6, %xmm7
  513. GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  514. pxor %xmm7, %xmm8
  515. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  516. .elseif \i == 7
  517. pxor %xmm7, %xmm8
  518. GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
  519. .endif
  520. cmp $64, %r13
  521. jl _initial_blocks_done\num_initial_blocks\operation
  522. # no need for precomputed values
/*
 * Precomputations for HashKey, done in parallel with the encryption of the
 * first 4 blocks.  HashKey_i_k holds the XOR of the low and high parts of
 * HashKey^i (for Karatsuba purposes).
 */
  528. paddd ONE(%rip), \XMM0 # INCR Y0
  529. movdqa \XMM0, \XMM1
  530. movdqa SHUF_MASK(%rip), %xmm14
  531. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  532. paddd ONE(%rip), \XMM0 # INCR Y0
  533. movdqa \XMM0, \XMM2
  534. movdqa SHUF_MASK(%rip), %xmm14
  535. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  536. paddd ONE(%rip), \XMM0 # INCR Y0
  537. movdqa \XMM0, \XMM3
  538. movdqa SHUF_MASK(%rip), %xmm14
  539. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  540. paddd ONE(%rip), \XMM0 # INCR Y0
  541. movdqa \XMM0, \XMM4
  542. movdqa SHUF_MASK(%rip), %xmm14
  543. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  544. pxor 16*0(%arg1), \XMM1
  545. pxor 16*0(%arg1), \XMM2
  546. pxor 16*0(%arg1), \XMM3
  547. pxor 16*0(%arg1), \XMM4
  548. movdqa \TMP3, \TMP5
  549. pshufd $78, \TMP3, \TMP1
  550. pxor \TMP3, \TMP1
  551. movdqa \TMP1, HashKey_k(%rsp)
  552. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  553. # TMP5 = HashKey^2<<1 (mod poly)
  554. movdqa \TMP5, HashKey_2(%rsp)
  555. # HashKey_2 = HashKey^2<<1 (mod poly)
  556. pshufd $78, \TMP5, \TMP1
  557. pxor \TMP5, \TMP1
  558. movdqa \TMP1, HashKey_2_k(%rsp)
  559. .irpc index, 1234 # do 4 rounds
  560. movaps 0x10*\index(%arg1), \TMP1
  561. AESENC \TMP1, \XMM1
  562. AESENC \TMP1, \XMM2
  563. AESENC \TMP1, \XMM3
  564. AESENC \TMP1, \XMM4
  565. .endr
  566. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
  567. # TMP5 = HashKey^3<<1 (mod poly)
  568. movdqa \TMP5, HashKey_3(%rsp)
  569. pshufd $78, \TMP5, \TMP1
  570. pxor \TMP5, \TMP1
  571. movdqa \TMP1, HashKey_3_k(%rsp)
  572. .irpc index, 56789 # do next 5 rounds
  573. movaps 0x10*\index(%arg1), \TMP1
  574. AESENC \TMP1, \XMM1
  575. AESENC \TMP1, \XMM2
  576. AESENC \TMP1, \XMM3
  577. AESENC \TMP1, \XMM4
  578. .endr
  579. GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
  581. movdqa \TMP5, HashKey_4(%rsp)
  582. pshufd $78, \TMP5, \TMP1
  583. pxor \TMP5, \TMP1
  584. movdqa \TMP1, HashKey_4_k(%rsp)
  585. movaps 0xa0(%arg1), \TMP2
  586. AESENCLAST \TMP2, \XMM1
  587. AESENCLAST \TMP2, \XMM2
  588. AESENCLAST \TMP2, \XMM3
  589. AESENCLAST \TMP2, \XMM4
  590. movdqu 16*0(%arg3 , %r11 , 1), \TMP1
  591. pxor \TMP1, \XMM1
  592. movdqu 16*1(%arg3 , %r11 , 1), \TMP1
  593. pxor \TMP1, \XMM2
  594. movdqu 16*2(%arg3 , %r11 , 1), \TMP1
  595. pxor \TMP1, \XMM3
  596. movdqu 16*3(%arg3 , %r11 , 1), \TMP1
  597. pxor \TMP1, \XMM4
  598. movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
  599. movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
  600. movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
  601. movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
  602. add $64, %r11
  603. movdqa SHUF_MASK(%rip), %xmm14
  604. PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
  605. pxor \XMMDst, \XMM1
  606. # combine GHASHed value with the corresponding ciphertext
  607. movdqa SHUF_MASK(%rip), %xmm14
  608. PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
  609. movdqa SHUF_MASK(%rip), %xmm14
  610. PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
  611. movdqa SHUF_MASK(%rip), %xmm14
  612. PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
  613. _initial_blocks_done\num_initial_blocks\operation:
  614. .endm
  615. /*
  616. * encrypt 4 blocks at a time
  617. * ghash the 4 previously encrypted ciphertext blocks
  618. * arg1, %arg2, %arg3 are used as pointers only, not modified
  619. * %r11 is the data offset value
  620. */
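/*
 * Rough sketch of one iteration of this macro (and of the _DEC variant
 * below), added for illustration only; AES_encrypt(), byteswap() and the
 * variable names are placeholders, not real interfaces:
 *
 *   // prev[0..3] are the four blocks produced by the previous iteration
 *   for (j = 0; j < 4; j++)
 *       keystream[j] = AES_encrypt(key, byteswap(++counter));
 *   // the ten AES rounds are interleaved with the four GHASH (Karatsuba)
 *   // multiplications of prev[0..3] by HashKey^4 .. HashKey^1, so the
 *   // AES and PCLMULQDQ dependency chains overlap
 *   for (j = 0; j < 4; j++)
 *       out[j] = in[j] ^ keystream[j];
 *   // the folded GHASH result is accumulated into XMM1 for the next round
 */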
  621. .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
  622. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  623. movdqa \XMM1, \XMM5
  624. movdqa \XMM2, \XMM6
  625. movdqa \XMM3, \XMM7
  626. movdqa \XMM4, \XMM8
  627. movdqa SHUF_MASK(%rip), %xmm15
  628. # multiply TMP5 * HashKey using karatsuba
  629. movdqa \XMM5, \TMP4
  630. pshufd $78, \XMM5, \TMP6
  631. pxor \XMM5, \TMP6
  632. paddd ONE(%rip), \XMM0 # INCR CNT
  633. movdqa HashKey_4(%rsp), \TMP5
  634. PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  635. movdqa \XMM0, \XMM1
  636. paddd ONE(%rip), \XMM0 # INCR CNT
  637. movdqa \XMM0, \XMM2
  638. paddd ONE(%rip), \XMM0 # INCR CNT
  639. movdqa \XMM0, \XMM3
  640. paddd ONE(%rip), \XMM0 # INCR CNT
  641. movdqa \XMM0, \XMM4
  642. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  643. PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  644. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  645. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  646. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  647. pxor (%arg1), \XMM1
  648. pxor (%arg1), \XMM2
  649. pxor (%arg1), \XMM3
  650. pxor (%arg1), \XMM4
  651. movdqa HashKey_4_k(%rsp), \TMP5
  652. PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  653. movaps 0x10(%arg1), \TMP1
  654. AESENC \TMP1, \XMM1 # Round 1
  655. AESENC \TMP1, \XMM2
  656. AESENC \TMP1, \XMM3
  657. AESENC \TMP1, \XMM4
  658. movaps 0x20(%arg1), \TMP1
  659. AESENC \TMP1, \XMM1 # Round 2
  660. AESENC \TMP1, \XMM2
  661. AESENC \TMP1, \XMM3
  662. AESENC \TMP1, \XMM4
  663. movdqa \XMM6, \TMP1
  664. pshufd $78, \XMM6, \TMP2
  665. pxor \XMM6, \TMP2
  666. movdqa HashKey_3(%rsp), \TMP5
  667. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  668. movaps 0x30(%arg1), \TMP3
  669. AESENC \TMP3, \XMM1 # Round 3
  670. AESENC \TMP3, \XMM2
  671. AESENC \TMP3, \XMM3
  672. AESENC \TMP3, \XMM4
  673. PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  674. movaps 0x40(%arg1), \TMP3
  675. AESENC \TMP3, \XMM1 # Round 4
  676. AESENC \TMP3, \XMM2
  677. AESENC \TMP3, \XMM3
  678. AESENC \TMP3, \XMM4
  679. movdqa HashKey_3_k(%rsp), \TMP5
  680. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  681. movaps 0x50(%arg1), \TMP3
  682. AESENC \TMP3, \XMM1 # Round 5
  683. AESENC \TMP3, \XMM2
  684. AESENC \TMP3, \XMM3
  685. AESENC \TMP3, \XMM4
  686. pxor \TMP1, \TMP4
  687. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  688. pxor \XMM6, \XMM5
  689. pxor \TMP2, \TMP6
  690. movdqa \XMM7, \TMP1
  691. pshufd $78, \XMM7, \TMP2
  692. pxor \XMM7, \TMP2
  693. movdqa HashKey_2(%rsp ), \TMP5
  694. # Multiply TMP5 * HashKey using karatsuba
  695. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  696. movaps 0x60(%arg1), \TMP3
  697. AESENC \TMP3, \XMM1 # Round 6
  698. AESENC \TMP3, \XMM2
  699. AESENC \TMP3, \XMM3
  700. AESENC \TMP3, \XMM4
  701. PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  702. movaps 0x70(%arg1), \TMP3
  703. AESENC \TMP3, \XMM1 # Round 7
  704. AESENC \TMP3, \XMM2
  705. AESENC \TMP3, \XMM3
  706. AESENC \TMP3, \XMM4
  707. movdqa HashKey_2_k(%rsp), \TMP5
  708. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  709. movaps 0x80(%arg1), \TMP3
  710. AESENC \TMP3, \XMM1 # Round 8
  711. AESENC \TMP3, \XMM2
  712. AESENC \TMP3, \XMM3
  713. AESENC \TMP3, \XMM4
  714. pxor \TMP1, \TMP4
  715. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  716. pxor \XMM7, \XMM5
  717. pxor \TMP2, \TMP6
  718. # Multiply XMM8 * HashKey
  719. # XMM8 and TMP5 hold the values for the two operands
  720. movdqa \XMM8, \TMP1
  721. pshufd $78, \XMM8, \TMP2
  722. pxor \XMM8, \TMP2
  723. movdqa HashKey(%rsp), \TMP5
  724. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  725. movaps 0x90(%arg1), \TMP3
  726. AESENC \TMP3, \XMM1 # Round 9
  727. AESENC \TMP3, \XMM2
  728. AESENC \TMP3, \XMM3
  729. AESENC \TMP3, \XMM4
  730. PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  731. movaps 0xa0(%arg1), \TMP3
  732. AESENCLAST \TMP3, \XMM1 # Round 10
  733. AESENCLAST \TMP3, \XMM2
  734. AESENCLAST \TMP3, \XMM3
  735. AESENCLAST \TMP3, \XMM4
  736. movdqa HashKey_k(%rsp), \TMP5
  737. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  738. movdqu (%arg3,%r11,1), \TMP3
  739. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  740. movdqu 16(%arg3,%r11,1), \TMP3
  741. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  742. movdqu 32(%arg3,%r11,1), \TMP3
  743. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  744. movdqu 48(%arg3,%r11,1), \TMP3
  745. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  746. movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
  747. movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
  748. movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
  749. movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
  750. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  751. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  752. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  753. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  754. pxor \TMP4, \TMP1
  755. pxor \XMM8, \XMM5
  756. pxor \TMP6, \TMP2
  757. pxor \TMP1, \TMP2
  758. pxor \XMM5, \TMP2
  759. movdqa \TMP2, \TMP3
  760. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  761. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  762. pxor \TMP3, \XMM5
  763. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  764. # first phase of reduction
  765. movdqa \XMM5, \TMP2
  766. movdqa \XMM5, \TMP3
  767. movdqa \XMM5, \TMP4
  768. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
pslld $31, \TMP2 # packed left shift << 31
pslld $30, \TMP3 # packed left shift << 30
pslld $25, \TMP4 # packed left shift << 25
  772. pxor \TMP3, \TMP2 # xor the shifted versions
  773. pxor \TMP4, \TMP2
  774. movdqa \TMP2, \TMP5
  775. psrldq $4, \TMP5 # right shift T5 1 DW
  776. pslldq $12, \TMP2 # left shift T2 3 DWs
  777. pxor \TMP2, \XMM5
  778. # second phase of reduction
  779. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  780. movdqa \XMM5,\TMP3
  781. movdqa \XMM5,\TMP4
psrld $1, \TMP2 # packed right shift >> 1
psrld $2, \TMP3 # packed right shift >> 2
psrld $7, \TMP4 # packed right shift >> 7
  785. pxor \TMP3,\TMP2 # xor the shifted versions
  786. pxor \TMP4,\TMP2
  787. pxor \TMP5, \TMP2
  788. pxor \TMP2, \XMM5
pxor \TMP1, \XMM5 # result is in XMM5
  790. pxor \XMM5, \XMM1
  791. .endm
  792. /*
  793. * decrypt 4 blocks at a time
  794. * ghash the 4 previously decrypted ciphertext blocks
  795. * arg1, %arg2, %arg3 are used as pointers only, not modified
  796. * %r11 is the data offset value
  797. */
  798. .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
  799. TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
  800. movdqa \XMM1, \XMM5
  801. movdqa \XMM2, \XMM6
  802. movdqa \XMM3, \XMM7
  803. movdqa \XMM4, \XMM8
  804. movdqa SHUF_MASK(%rip), %xmm15
  805. # multiply TMP5 * HashKey using karatsuba
  806. movdqa \XMM5, \TMP4
  807. pshufd $78, \XMM5, \TMP6
  808. pxor \XMM5, \TMP6
  809. paddd ONE(%rip), \XMM0 # INCR CNT
  810. movdqa HashKey_4(%rsp), \TMP5
  811. PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
  812. movdqa \XMM0, \XMM1
  813. paddd ONE(%rip), \XMM0 # INCR CNT
  814. movdqa \XMM0, \XMM2
  815. paddd ONE(%rip), \XMM0 # INCR CNT
  816. movdqa \XMM0, \XMM3
  817. paddd ONE(%rip), \XMM0 # INCR CNT
  818. movdqa \XMM0, \XMM4
  819. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  820. PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
  821. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  822. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  823. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  824. pxor (%arg1), \XMM1
  825. pxor (%arg1), \XMM2
  826. pxor (%arg1), \XMM3
  827. pxor (%arg1), \XMM4
  828. movdqa HashKey_4_k(%rsp), \TMP5
  829. PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
  830. movaps 0x10(%arg1), \TMP1
  831. AESENC \TMP1, \XMM1 # Round 1
  832. AESENC \TMP1, \XMM2
  833. AESENC \TMP1, \XMM3
  834. AESENC \TMP1, \XMM4
  835. movaps 0x20(%arg1), \TMP1
  836. AESENC \TMP1, \XMM1 # Round 2
  837. AESENC \TMP1, \XMM2
  838. AESENC \TMP1, \XMM3
  839. AESENC \TMP1, \XMM4
  840. movdqa \XMM6, \TMP1
  841. pshufd $78, \XMM6, \TMP2
  842. pxor \XMM6, \TMP2
  843. movdqa HashKey_3(%rsp), \TMP5
  844. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
  845. movaps 0x30(%arg1), \TMP3
  846. AESENC \TMP3, \XMM1 # Round 3
  847. AESENC \TMP3, \XMM2
  848. AESENC \TMP3, \XMM3
  849. AESENC \TMP3, \XMM4
  850. PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
  851. movaps 0x40(%arg1), \TMP3
  852. AESENC \TMP3, \XMM1 # Round 4
  853. AESENC \TMP3, \XMM2
  854. AESENC \TMP3, \XMM3
  855. AESENC \TMP3, \XMM4
  856. movdqa HashKey_3_k(%rsp), \TMP5
  857. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  858. movaps 0x50(%arg1), \TMP3
  859. AESENC \TMP3, \XMM1 # Round 5
  860. AESENC \TMP3, \XMM2
  861. AESENC \TMP3, \XMM3
  862. AESENC \TMP3, \XMM4
  863. pxor \TMP1, \TMP4
  864. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  865. pxor \XMM6, \XMM5
  866. pxor \TMP2, \TMP6
  867. movdqa \XMM7, \TMP1
  868. pshufd $78, \XMM7, \TMP2
  869. pxor \XMM7, \TMP2
  870. movdqa HashKey_2(%rsp ), \TMP5
  871. # Multiply TMP5 * HashKey using karatsuba
  872. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  873. movaps 0x60(%arg1), \TMP3
  874. AESENC \TMP3, \XMM1 # Round 6
  875. AESENC \TMP3, \XMM2
  876. AESENC \TMP3, \XMM3
  877. AESENC \TMP3, \XMM4
  878. PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
  879. movaps 0x70(%arg1), \TMP3
  880. AESENC \TMP3, \XMM1 # Round 7
  881. AESENC \TMP3, \XMM2
  882. AESENC \TMP3, \XMM3
  883. AESENC \TMP3, \XMM4
  884. movdqa HashKey_2_k(%rsp), \TMP5
  885. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  886. movaps 0x80(%arg1), \TMP3
  887. AESENC \TMP3, \XMM1 # Round 8
  888. AESENC \TMP3, \XMM2
  889. AESENC \TMP3, \XMM3
  890. AESENC \TMP3, \XMM4
  891. pxor \TMP1, \TMP4
  892. # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
  893. pxor \XMM7, \XMM5
  894. pxor \TMP2, \TMP6
  895. # Multiply XMM8 * HashKey
  896. # XMM8 and TMP5 hold the values for the two operands
  897. movdqa \XMM8, \TMP1
  898. pshufd $78, \XMM8, \TMP2
  899. pxor \XMM8, \TMP2
  900. movdqa HashKey(%rsp), \TMP5
  901. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  902. movaps 0x90(%arg1), \TMP3
  903. AESENC \TMP3, \XMM1 # Round 9
  904. AESENC \TMP3, \XMM2
  905. AESENC \TMP3, \XMM3
  906. AESENC \TMP3, \XMM4
  907. PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
  908. movaps 0xa0(%arg1), \TMP3
  909. AESENCLAST \TMP3, \XMM1 # Round 10
  910. AESENCLAST \TMP3, \XMM2
  911. AESENCLAST \TMP3, \XMM3
  912. AESENCLAST \TMP3, \XMM4
  913. movdqa HashKey_k(%rsp), \TMP5
  914. PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  915. movdqu (%arg3,%r11,1), \TMP3
  916. pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
  917. movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
  918. movdqa \TMP3, \XMM1
  919. movdqu 16(%arg3,%r11,1), \TMP3
  920. pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
  921. movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
  922. movdqa \TMP3, \XMM2
  923. movdqu 32(%arg3,%r11,1), \TMP3
  924. pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
  925. movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
  926. movdqa \TMP3, \XMM3
  927. movdqu 48(%arg3,%r11,1), \TMP3
  928. pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
  929. movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
  930. movdqa \TMP3, \XMM4
  931. PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
  932. PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
  933. PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
  934. PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
  935. pxor \TMP4, \TMP1
  936. pxor \XMM8, \XMM5
  937. pxor \TMP6, \TMP2
  938. pxor \TMP1, \TMP2
  939. pxor \XMM5, \TMP2
  940. movdqa \TMP2, \TMP3
  941. pslldq $8, \TMP3 # left shift TMP3 2 DWs
  942. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  943. pxor \TMP3, \XMM5
  944. pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
  945. # first phase of reduction
  946. movdqa \XMM5, \TMP2
  947. movdqa \XMM5, \TMP3
  948. movdqa \XMM5, \TMP4
  949. # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
pslld $31, \TMP2 # packed left shift << 31
pslld $30, \TMP3 # packed left shift << 30
pslld $25, \TMP4 # packed left shift << 25
  953. pxor \TMP3, \TMP2 # xor the shifted versions
  954. pxor \TMP4, \TMP2
  955. movdqa \TMP2, \TMP5
  956. psrldq $4, \TMP5 # right shift T5 1 DW
  957. pslldq $12, \TMP2 # left shift T2 3 DWs
  958. pxor \TMP2, \XMM5
  959. # second phase of reduction
  960. movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
  961. movdqa \XMM5,\TMP3
  962. movdqa \XMM5,\TMP4
psrld $1, \TMP2 # packed right shift >> 1
psrld $2, \TMP3 # packed right shift >> 2
psrld $7, \TMP4 # packed right shift >> 7
  966. pxor \TMP3,\TMP2 # xor the shifted versions
  967. pxor \TMP4,\TMP2
  968. pxor \TMP5, \TMP2
  969. pxor \TMP2, \XMM5
pxor \TMP1, \XMM5 # result is in XMM5
  971. pxor \XMM5, \XMM1
  972. .endm
  973. /* GHASH the last 4 ciphertext blocks. */
  974. .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
  975. TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
  976. # Multiply TMP6 * HashKey (using Karatsuba)
  977. movdqa \XMM1, \TMP6
  978. pshufd $78, \XMM1, \TMP2
  979. pxor \XMM1, \TMP2
  980. movdqa HashKey_4(%rsp), \TMP5
  981. PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
  982. PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
  983. movdqa HashKey_4_k(%rsp), \TMP4
  984. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  985. movdqa \XMM1, \XMMDst
  986. movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
  987. # Multiply TMP1 * HashKey (using Karatsuba)
  988. movdqa \XMM2, \TMP1
  989. pshufd $78, \XMM2, \TMP2
  990. pxor \XMM2, \TMP2
  991. movdqa HashKey_3(%rsp), \TMP5
  992. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  993. PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
  994. movdqa HashKey_3_k(%rsp), \TMP4
  995. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  996. pxor \TMP1, \TMP6
  997. pxor \XMM2, \XMMDst
  998. pxor \TMP2, \XMM1
  999. # results accumulated in TMP6, XMMDst, XMM1
  1000. # Multiply TMP1 * HashKey (using Karatsuba)
  1001. movdqa \XMM3, \TMP1
  1002. pshufd $78, \XMM3, \TMP2
  1003. pxor \XMM3, \TMP2
  1004. movdqa HashKey_2(%rsp), \TMP5
  1005. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1006. PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
  1007. movdqa HashKey_2_k(%rsp), \TMP4
  1008. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1009. pxor \TMP1, \TMP6
  1010. pxor \XMM3, \XMMDst
  1011. pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
  1012. # Multiply TMP1 * HashKey (using Karatsuba)
  1013. movdqa \XMM4, \TMP1
  1014. pshufd $78, \XMM4, \TMP2
  1015. pxor \XMM4, \TMP2
  1016. movdqa HashKey(%rsp), \TMP5
  1017. PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
  1018. PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
  1019. movdqa HashKey_k(%rsp), \TMP4
  1020. PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
  1021. pxor \TMP1, \TMP6
  1022. pxor \XMM4, \XMMDst
  1023. pxor \XMM1, \TMP2
  1024. pxor \TMP6, \TMP2
  1025. pxor \XMMDst, \TMP2
  1026. # middle section of the temp results combined as in karatsuba algorithm
  1027. movdqa \TMP2, \TMP4
  1028. pslldq $8, \TMP4 # left shift TMP4 2 DWs
  1029. psrldq $8, \TMP2 # right shift TMP2 2 DWs
  1030. pxor \TMP4, \XMMDst
  1031. pxor \TMP2, \TMP6
  1032. # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
  1033. # first phase of the reduction
  1034. movdqa \XMMDst, \TMP2
  1035. movdqa \XMMDst, \TMP3
  1036. movdqa \XMMDst, \TMP4
  1037. # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
pslld $31, \TMP2 # packed left shift << 31
pslld $30, \TMP3 # packed left shift << 30
pslld $25, \TMP4 # packed left shift << 25
  1041. pxor \TMP3, \TMP2 # xor the shifted versions
  1042. pxor \TMP4, \TMP2
  1043. movdqa \TMP2, \TMP7
  1044. psrldq $4, \TMP7 # right shift TMP7 1 DW
  1045. pslldq $12, \TMP2 # left shift TMP2 3 DWs
  1046. pxor \TMP2, \XMMDst
  1047. # second phase of the reduction
  1048. movdqa \XMMDst, \TMP2
  1049. # make 3 copies of XMMDst for doing 3 shift operations
  1050. movdqa \XMMDst, \TMP3
  1051. movdqa \XMMDst, \TMP4
psrld $1, \TMP2 # packed right shift >> 1
psrld $2, \TMP3 # packed right shift >> 2
psrld $7, \TMP4 # packed right shift >> 7
  1055. pxor \TMP3, \TMP2 # xor the shifted versions
  1056. pxor \TMP4, \TMP2
  1057. pxor \TMP7, \TMP2
  1058. pxor \TMP2, \XMMDst
  1059. pxor \TMP6, \XMMDst # reduced result is in XMMDst
  1060. .endm
/* Encrypt a single block */
  1062. .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
  1063. pxor (%arg1), \XMM0
  1064. movaps 16(%arg1), \TMP1
  1065. AESENC \TMP1, \XMM0
  1066. movaps 32(%arg1), \TMP1
  1067. AESENC \TMP1, \XMM0
  1068. movaps 48(%arg1), \TMP1
  1069. AESENC \TMP1, \XMM0
  1070. movaps 64(%arg1), \TMP1
  1071. AESENC \TMP1, \XMM0
  1072. movaps 80(%arg1), \TMP1
  1073. AESENC \TMP1, \XMM0
  1074. movaps 96(%arg1), \TMP1
  1075. AESENC \TMP1, \XMM0
  1076. movaps 112(%arg1), \TMP1
  1077. AESENC \TMP1, \XMM0
  1078. movaps 128(%arg1), \TMP1
  1079. AESENC \TMP1, \XMM0
  1080. movaps 144(%arg1), \TMP1
  1081. AESENC \TMP1, \XMM0
  1082. movaps 160(%arg1), \TMP1
  1083. AESENCLAST \TMP1, \XMM0
  1084. .endm
  1085. /*****************************************************************************
  1086. * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1087. * u8 *out, // Plaintext output. Encrypt in-place is allowed.
  1088. * const u8 *in, // Ciphertext input
  1089. * u64 plaintext_len, // Length of data in bytes for decryption.
  1090. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1091. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1092. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1093. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1094. * const u8 *aad, // Additional Authentication Data (AAD)
  1095. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  1096. * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
  1097. * // given authentication tag and only return the plaintext if they match.
  1098. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
  1099. * // (most likely), 12 or 8.
  1100. *
  1101. * Assumptions:
  1102. *
  1103. * keys:
  1104. * keys are pre-expanded and aligned to 16 bytes. we are using the first
  1105. * set of 11 keys in the data structure void *aes_ctx
  1106. *
  1107. * iv:
  1108. * 0 1 2 3
  1109. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1110. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1111. * | Salt (From the SA) |
  1112. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1113. * | Initialization Vector |
  1114. * | (This is the sequence number from IPSec header) |
  1115. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1116. * | 0x1 |
  1117. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1118. *
  1119. *
  1120. *
  1121. * AAD:
  1122. * AAD padded to 128 bits with 0
  1123. * for example, assume AAD is a u32 vector
  1124. *
  1125. * if AAD is 8 bytes:
  1126. * AAD[3] = {A0, A1};
  1127. * padded AAD in xmm register = {A1 A0 0 0}
  1128. *
  1129. * 0 1 2 3
  1130. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1131. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1132. * | SPI (A1) |
  1133. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1134. * | 32-bit Sequence Number (A0) |
  1135. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1136. * | 0x0 |
  1137. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1138. *
  1139. * AAD Format with 32-bit Sequence Number
  1140. *
  1141. * if AAD is 12 bytes:
  1142. * AAD[3] = {A0, A1, A2};
  1143. * padded AAD in xmm register = {A2 A1 A0 0}
  1144. *
  1145. * 0 1 2 3
  1146. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1147. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1148. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1149. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1150. * | SPI (A2) |
  1151. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1152. * | 64-bit Extended Sequence Number {A1,A0} |
  1153. * | |
  1154. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1155. * | 0x0 |
  1156. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1157. *
  1158. * AAD Format with 64-bit Extended Sequence Number
  1159. *
  1160. * aadLen:
  1161. * from the definition of the spec, aadLen can only be 8 or 12 bytes.
  1162. * The code supports 16 too but for other sizes, the code will fail.
  1163. *
  1164. * TLen:
  1165. * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
  1166. * For other sizes, the code will fail.
  1167. *
  1168. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1169. *
  1170. *****************************************************************************/
  1171. ENTRY(aesni_gcm_dec)
  1172. push %r12
  1173. push %r13
  1174. push %r14
  1175. mov %rsp, %r14
  1176. /*
  1177. * states of %xmm registers %xmm6:%xmm15 not saved
  1178. * all %xmm registers are clobbered
  1179. */
  1180. sub $VARIABLE_OFFSET, %rsp
  1181. and $~63, %rsp # align rsp to 64 bytes
  1182. mov %arg6, %r12
  1183. movdqu (%r12), %xmm13 # %xmm13 = HashKey
  1184. movdqa SHUF_MASK(%rip), %xmm2
  1185. PSHUFB_XMM %xmm2, %xmm13
  1186. # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
  1187. movdqa %xmm13, %xmm2
  1188. psllq $1, %xmm13
  1189. psrlq $63, %xmm2
  1190. movdqa %xmm2, %xmm1
  1191. pslldq $8, %xmm2
  1192. psrldq $8, %xmm1
  1193. por %xmm2, %xmm13
  1194. # Reduction
  1195. pshufd $0x24, %xmm1, %xmm2
  1196. pcmpeqd TWOONE(%rip), %xmm2
  1197. pand POLY(%rip), %xmm2
  1198. pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
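# In C-like terms (illustrative note, not part of the original source):
# HashKey = (HashKey << 1) ^ (carry ? POLY : 0)
# where POLY is the 128-bit constant defined at the top of this file and
# carry is the bit shifted out of the (byte-swapped) HashKey.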
  1199. # Decrypt first few blocks
  1200. movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
  1201. mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
  1202. and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
  1203. mov %r13, %r12
  1204. and $(3<<4), %r12
  1205. jz _initial_num_blocks_is_0_decrypt
  1206. cmp $(2<<4), %r12
  1207. jb _initial_num_blocks_is_1_decrypt
  1208. je _initial_num_blocks_is_2_decrypt
  1209. _initial_num_blocks_is_3_decrypt:
  1210. INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1211. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
  1212. sub $48, %r13
  1213. jmp _initial_blocks_decrypted
  1214. _initial_num_blocks_is_2_decrypt:
  1215. INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1216. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
  1217. sub $32, %r13
  1218. jmp _initial_blocks_decrypted
  1219. _initial_num_blocks_is_1_decrypt:
  1220. INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1221. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
  1222. sub $16, %r13
  1223. jmp _initial_blocks_decrypted
  1224. _initial_num_blocks_is_0_decrypt:
  1225. INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1226. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
  1227. _initial_blocks_decrypted:
  1228. cmp $0, %r13
  1229. je _zero_cipher_left_decrypt
  1230. sub $64, %r13
  1231. je _four_cipher_left_decrypt
  1232. _decrypt_by_4:
  1233. GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  1234. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
  1235. add $64, %r11
  1236. sub $64, %r13
  1237. jne _decrypt_by_4
  1238. _four_cipher_left_decrypt:
  1239. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  1240. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  1241. _zero_cipher_left_decrypt:
  1242. mov %arg4, %r13
  1243. and $15, %r13 # %r13 = arg4 (mod 16)
  1244. je _multiple_of_16_bytes_decrypt
  1245. # Handle the last <16 byte block separately
  1246. paddd ONE(%rip), %xmm0 # increment CNT to get Yn
  1247. movdqa SHUF_MASK(%rip), %xmm10
  1248. PSHUFB_XMM %xmm10, %xmm0
  1249. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
  1250. sub $16, %r11
  1251. add %r13, %r11
  1252. movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
  1253. lea SHIFT_MASK+16(%rip), %r12
  1254. sub %r13, %r12
  1255. # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
  1256. # (%r13 is the number of bytes in plaintext mod 16)
  1257. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1258. PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
  1259. movdqa %xmm1, %xmm2
  1260. pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
  1261. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  1262. # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
  1263. pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
  1264. pand %xmm1, %xmm2
  1265. movdqa SHUF_MASK(%rip), %xmm10
  1266. PSHUFB_XMM %xmm10 ,%xmm2
  1267. pxor %xmm2, %xmm8
  1268. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1269. # GHASH computation for the last <16 byte block
  1270. sub %r13, %r11
  1271. add $16, %r11
  1272. # output %r13 bytes
  1273. MOVQ_R64_XMM %xmm0, %rax
  1274. cmp $8, %r13
  1275. jle _less_than_8_bytes_left_decrypt
  1276. mov %rax, (%arg2 , %r11, 1)
  1277. add $8, %r11
  1278. psrldq $8, %xmm0
  1279. MOVQ_R64_XMM %xmm0, %rax
  1280. sub $8, %r13
  1281. _less_than_8_bytes_left_decrypt:
  1282. mov %al, (%arg2, %r11, 1)
  1283. add $1, %r11
  1284. shr $8, %rax
  1285. sub $1, %r13
  1286. jne _less_than_8_bytes_left_decrypt
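/*
 * Reference sketch (illustrative): how the final len%16 output bytes are
 * written above - one 8-byte store when more than eight bytes remain,
 * then one byte at a time from the low end of the register. Names are
 * invented for the sketch; 'lo'/'hi' stand for the two qwords of %xmm0,
 * and the memcpy relies on a little-endian machine, as on x86.
 *
 *	#include <string.h>
 *
 *	static void store_tail(unsigned char *dst, unsigned long long lo,
 *			       unsigned long long hi, unsigned int nbytes)
 *	{
 *		unsigned int i = 0;
 *		unsigned long long q = lo;
 *
 *		if (nbytes > 8) {
 *			memcpy(dst, &lo, 8);	// mov %rax, (%arg2,%r11,1)
 *			i = 8;
 *			q = hi;			// psrldq $8 + MOVQ_R64_XMM
 *		}
 *		for (; i < nbytes; i++, q >>= 8)
 *			dst[i] = (unsigned char)q;	// mov %al / shr $8
 *	}
 */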
  1287. _multiple_of_16_bytes_decrypt:
1288. mov arg8, %r12 # %r12 = aadLen (number of bytes)
  1289. shl $3, %r12 # convert into number of bits
  1290. movd %r12d, %xmm15 # len(A) in %xmm15
1291. shl $3, %arg4 # len(C) in bits (*8)
  1292. MOVQ_R64_XMM %arg4, %xmm1
  1293. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  1294. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  1295. pxor %xmm15, %xmm8
  1296. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1297. # final GHASH computation
  1298. movdqa SHUF_MASK(%rip), %xmm10
  1299. PSHUFB_XMM %xmm10, %xmm8
  1300. mov %arg5, %rax # %rax = *Y0
  1301. movdqu (%rax), %xmm0 # %xmm0 = Y0
  1302. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
  1303. pxor %xmm8, %xmm0
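/*
 * Reference sketch (illustrative): the authentication tag computed above
 * is T = E(K, Y0) XOR GHASH(H, A, C), where the last GHASH input is the
 * length block len(A)||len(C), both lengths in bits as 64-bit big-endian
 * values. The code builds that block in the register's internal byte
 * order and byte-swaps around the multiply; the sketch below shows the
 * on-the-wire layout defined by GCM.
 *
 *	static void build_len_block(unsigned char blk[16],
 *				    unsigned long long aad_bytes,
 *				    unsigned long long text_bytes)
 *	{
 *		unsigned long long abits = aad_bytes * 8;
 *		unsigned long long cbits = text_bytes * 8;
 *		int i;
 *
 *		for (i = 0; i < 8; i++) {
 *			blk[7 - i]  = (unsigned char)(abits >> (8 * i));
 *			blk[15 - i] = (unsigned char)(cbits >> (8 * i));
 *		}
 *	}
 */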
  1304. _return_T_decrypt:
  1305. mov arg9, %r10 # %r10 = authTag
  1306. mov arg10, %r11 # %r11 = auth_tag_len
  1307. cmp $16, %r11
  1308. je _T_16_decrypt
  1309. cmp $12, %r11
  1310. je _T_12_decrypt
  1311. _T_8_decrypt:
  1312. MOVQ_R64_XMM %xmm0, %rax
  1313. mov %rax, (%r10)
  1314. jmp _return_T_done_decrypt
  1315. _T_12_decrypt:
  1316. MOVQ_R64_XMM %xmm0, %rax
  1317. mov %rax, (%r10)
  1318. psrldq $8, %xmm0
  1319. movd %xmm0, %eax
  1320. mov %eax, 8(%r10)
  1321. jmp _return_T_done_decrypt
  1322. _T_16_decrypt:
  1323. movdqu %xmm0, (%r10)
  1324. _return_T_done_decrypt:
  1325. mov %r14, %rsp
  1326. pop %r14
  1327. pop %r13
  1328. pop %r12
  1329. ret
  1330. ENDPROC(aesni_gcm_dec)
  1331. /*****************************************************************************
  1332. * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
  1333. * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
  1334. * const u8 *in, // Plaintext input
  1335. * u64 plaintext_len, // Length of data in bytes for encryption.
  1336. * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
  1337. * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
  1338. * // concatenated with 0x00000001. 16-byte aligned pointer.
  1339. * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
  1340. * const u8 *aad, // Additional Authentication Data (AAD)
  1341. * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
  1342. * u8 *auth_tag, // Authenticated Tag output.
  1343. * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
  1344. * // 12 or 8.
  1345. *
  1346. * Assumptions:
  1347. *
  1348. * keys:
  1349. * keys are pre-expanded and aligned to 16 bytes. we are using the
  1350. * first set of 11 keys in the data structure void *aes_ctx
  1351. *
  1352. *
  1353. * iv:
  1354. * 0 1 2 3
  1355. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1356. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1357. * | Salt (From the SA) |
  1358. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1359. * | Initialization Vector |
  1360. * | (This is the sequence number from IPSec header) |
  1361. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1362. * | 0x1 |
  1363. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1364. *
  1365. *
  1366. *
  1367. * AAD:
  1368. * AAD padded to 128 bits with 0
  1369. * for example, assume AAD is a u32 vector
  1370. *
  1371. * if AAD is 8 bytes:
  1372. * AAD[3] = {A0, A1};
  1373. * padded AAD in xmm register = {A1 A0 0 0}
  1374. *
  1375. * 0 1 2 3
  1376. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1377. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1378. * | SPI (A1) |
  1379. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1380. * | 32-bit Sequence Number (A0) |
  1381. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1382. * | 0x0 |
  1383. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1384. *
  1385. * AAD Format with 32-bit Sequence Number
  1386. *
  1387. * if AAD is 12 bytes:
  1388. * AAD[3] = {A0, A1, A2};
  1389. * padded AAD in xmm register = {A2 A1 A0 0}
  1390. *
  1391. * 0 1 2 3
  1392. * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  1393. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1394. * | SPI (A2) |
  1395. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1396. * | 64-bit Extended Sequence Number {A1,A0} |
  1397. * | |
  1398. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1399. * | 0x0 |
  1400. * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  1401. *
  1402. * AAD Format with 64-bit Extended Sequence Number
  1403. *
1404. * aadLen:
1405. * per the definition in the spec, aadLen can only be 8 or 12 bytes.
1406. * The code also supports aadLen of 16 bytes; any other size will fail.
1407. *
1408. * TLen:
1409. * per the definition in the spec, TLen can only be 8, 12 or 16 bytes.
1410. * Any other size will fail.
  1411. *
  1412. * poly = x^128 + x^127 + x^126 + x^121 + 1
  1413. ***************************************************************************/
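/*
 * Reference sketch (illustrative, not the kernel's glue code): building
 * the 16-byte pre-counter block j0 described above from the 4-byte salt
 * and the 8-byte per-packet IV, with the trailing 32-bit counter set to
 * 0x00000001. The function name is invented for this sketch.
 *
 *	#include <string.h>
 *
 *	static void build_j0(unsigned char j0[16], const unsigned char salt[4],
 *			     const unsigned char iv[8])
 *	{
 *		memcpy(j0, salt, 4);		// salt from the SA
 *		memcpy(j0 + 4, iv, 8);		// IV from the ESP payload
 *		j0[12] = 0;
 *		j0[13] = 0;
 *		j0[14] = 0;
 *		j0[15] = 1;			// 0x00000001, big endian
 *	}
 */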
  1414. ENTRY(aesni_gcm_enc)
  1415. push %r12
  1416. push %r13
  1417. push %r14
  1418. mov %rsp, %r14
  1419. #
  1420. # states of %xmm registers %xmm6:%xmm15 not saved
  1421. # all %xmm registers are clobbered
  1422. #
  1423. sub $VARIABLE_OFFSET, %rsp
  1424. and $~63, %rsp
  1425. mov %arg6, %r12
  1426. movdqu (%r12), %xmm13
  1427. movdqa SHUF_MASK(%rip), %xmm2
  1428. PSHUFB_XMM %xmm2, %xmm13
  1429. # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
  1430. movdqa %xmm13, %xmm2
  1431. psllq $1, %xmm13
  1432. psrlq $63, %xmm2
  1433. movdqa %xmm2, %xmm1
  1434. pslldq $8, %xmm2
  1435. psrldq $8, %xmm1
  1436. por %xmm2, %xmm13
  1437. # reduce HashKey<<1
  1438. pshufd $0x24, %xmm1, %xmm2
  1439. pcmpeqd TWOONE(%rip), %xmm2
  1440. pand POLY(%rip), %xmm2
  1441. pxor %xmm2, %xmm13
1442. movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1443. mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1444. and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
  1445. mov %r13, %r12
  1446. # Encrypt first few blocks
  1447. and $(3<<4), %r12
  1448. jz _initial_num_blocks_is_0_encrypt
  1449. cmp $(2<<4), %r12
  1450. jb _initial_num_blocks_is_1_encrypt
  1451. je _initial_num_blocks_is_2_encrypt
  1452. _initial_num_blocks_is_3_encrypt:
  1453. INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1454. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
  1455. sub $48, %r13
  1456. jmp _initial_blocks_encrypted
  1457. _initial_num_blocks_is_2_encrypt:
  1458. INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1459. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
  1460. sub $32, %r13
  1461. jmp _initial_blocks_encrypted
  1462. _initial_num_blocks_is_1_encrypt:
  1463. INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1464. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
  1465. sub $16, %r13
  1466. jmp _initial_blocks_encrypted
  1467. _initial_num_blocks_is_0_encrypt:
  1468. INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
  1469. %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
  1470. _initial_blocks_encrypted:
  1471. # Main loop - Encrypt remaining blocks
  1472. cmp $0, %r13
  1473. je _zero_cipher_left_encrypt
  1474. sub $64, %r13
  1475. je _four_cipher_left_encrypt
  1476. _encrypt_by_4_encrypt:
  1477. GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
  1478. %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
  1479. add $64, %r11
  1480. sub $64, %r13
  1481. jne _encrypt_by_4_encrypt
  1482. _four_cipher_left_encrypt:
  1483. GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
  1484. %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
  1485. _zero_cipher_left_encrypt:
  1486. mov %arg4, %r13
  1487. and $15, %r13 # %r13 = arg4 (mod 16)
  1488. je _multiple_of_16_bytes_encrypt
  1489. # Handle the last <16 Byte block separately
  1490. paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
  1491. movdqa SHUF_MASK(%rip), %xmm10
  1492. PSHUFB_XMM %xmm10, %xmm0
  1493. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
  1494. sub $16, %r11
  1495. add %r13, %r11
1496. movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
  1497. lea SHIFT_MASK+16(%rip), %r12
  1498. sub %r13, %r12
  1499. # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
  1500. # (%r13 is the number of bytes in plaintext mod 16)
  1501. movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1502. PSHUFB_XMM %xmm2, %xmm1 # shift right 16-%r13 bytes
  1503. pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
  1504. movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
  1505. # get the appropriate mask to mask out top 16-r13 bytes of xmm0
  1506. pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
  1507. movdqa SHUF_MASK(%rip), %xmm10
  1508. PSHUFB_XMM %xmm10,%xmm0
  1509. pxor %xmm0, %xmm8
  1510. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1511. # GHASH computation for the last <16 byte block
  1512. sub %r13, %r11
  1513. add $16, %r11
  1514. movdqa SHUF_MASK(%rip), %xmm10
  1515. PSHUFB_XMM %xmm10, %xmm0
  1516. # shuffle xmm0 back to output as ciphertext
  1517. # Output %r13 bytes
  1518. MOVQ_R64_XMM %xmm0, %rax
  1519. cmp $8, %r13
  1520. jle _less_than_8_bytes_left_encrypt
  1521. mov %rax, (%arg2 , %r11, 1)
  1522. add $8, %r11
  1523. psrldq $8, %xmm0
  1524. MOVQ_R64_XMM %xmm0, %rax
  1525. sub $8, %r13
  1526. _less_than_8_bytes_left_encrypt:
  1527. mov %al, (%arg2, %r11, 1)
  1528. add $1, %r11
  1529. shr $8, %rax
  1530. sub $1, %r13
  1531. jne _less_than_8_bytes_left_encrypt
  1532. _multiple_of_16_bytes_encrypt:
1533. mov arg8, %r12 # %r12 = aadLen (number of bytes)
  1534. shl $3, %r12
  1535. movd %r12d, %xmm15 # len(A) in %xmm15
1536. shl $3, %arg4 # len(C) in bits (*8)
  1537. MOVQ_R64_XMM %arg4, %xmm1
  1538. pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
  1539. pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
  1540. pxor %xmm15, %xmm8
  1541. GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
  1542. # final GHASH computation
  1543. movdqa SHUF_MASK(%rip), %xmm10
  1544. PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
  1545. mov %arg5, %rax # %rax = *Y0
  1546. movdqu (%rax), %xmm0 # %xmm0 = Y0
  1547. ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
  1548. pxor %xmm8, %xmm0
  1549. _return_T_encrypt:
  1550. mov arg9, %r10 # %r10 = authTag
  1551. mov arg10, %r11 # %r11 = auth_tag_len
  1552. cmp $16, %r11
  1553. je _T_16_encrypt
  1554. cmp $12, %r11
  1555. je _T_12_encrypt
  1556. _T_8_encrypt:
  1557. MOVQ_R64_XMM %xmm0, %rax
  1558. mov %rax, (%r10)
  1559. jmp _return_T_done_encrypt
  1560. _T_12_encrypt:
  1561. MOVQ_R64_XMM %xmm0, %rax
  1562. mov %rax, (%r10)
  1563. psrldq $8, %xmm0
  1564. movd %xmm0, %eax
  1565. mov %eax, 8(%r10)
  1566. jmp _return_T_done_encrypt
  1567. _T_16_encrypt:
  1568. movdqu %xmm0, (%r10)
  1569. _return_T_done_encrypt:
  1570. mov %r14, %rsp
  1571. pop %r14
  1572. pop %r13
  1573. pop %r12
  1574. ret
  1575. ENDPROC(aesni_gcm_enc)
  1576. #endif
  1577. .align 4
  1578. _key_expansion_128:
  1579. _key_expansion_256a:
  1580. pshufd $0b11111111, %xmm1, %xmm1
  1581. shufps $0b00010000, %xmm0, %xmm4
  1582. pxor %xmm4, %xmm0
  1583. shufps $0b10001100, %xmm0, %xmm4
  1584. pxor %xmm4, %xmm0
  1585. pxor %xmm1, %xmm0
  1586. movaps %xmm0, (TKEYP)
  1587. add $0x10, TKEYP
  1588. ret
  1589. ENDPROC(_key_expansion_128)
  1590. ENDPROC(_key_expansion_256a)
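/*
 * Reference sketch (illustrative): what one _key_expansion_128 /
 * _key_expansion_256a step computes, expressed on 32-bit words. 't' is
 * the dword produced by AESKEYGENASSIST - SubWord(RotWord(w3)) XOR rcon -
 * which the pshufd above broadcasts to all four lanes; the two
 * shufps/pxor pairs then implement the running-XOR cascade shown below.
 * The function name is invented for the sketch.
 *
 *	#include <stdint.h>
 *
 *	static void aes128_expand_step(uint32_t rk[4], uint32_t t)
 *	{
 *		rk[0] ^= t;		// w4 = w0 ^ t
 *		rk[1] ^= rk[0];		// w5 = w1 ^ w4
 *		rk[2] ^= rk[1];		// w6 = w2 ^ w5
 *		rk[3] ^= rk[2];		// w7 = w3 ^ w6
 *	}
 */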
  1591. .align 4
  1592. _key_expansion_192a:
  1593. pshufd $0b01010101, %xmm1, %xmm1
  1594. shufps $0b00010000, %xmm0, %xmm4
  1595. pxor %xmm4, %xmm0
  1596. shufps $0b10001100, %xmm0, %xmm4
  1597. pxor %xmm4, %xmm0
  1598. pxor %xmm1, %xmm0
  1599. movaps %xmm2, %xmm5
  1600. movaps %xmm2, %xmm6
  1601. pslldq $4, %xmm5
  1602. pshufd $0b11111111, %xmm0, %xmm3
  1603. pxor %xmm3, %xmm2
  1604. pxor %xmm5, %xmm2
  1605. movaps %xmm0, %xmm1
  1606. shufps $0b01000100, %xmm0, %xmm6
  1607. movaps %xmm6, (TKEYP)
  1608. shufps $0b01001110, %xmm2, %xmm1
  1609. movaps %xmm1, 0x10(TKEYP)
  1610. add $0x20, TKEYP
  1611. ret
  1612. ENDPROC(_key_expansion_192a)
  1613. .align 4
  1614. _key_expansion_192b:
  1615. pshufd $0b01010101, %xmm1, %xmm1
  1616. shufps $0b00010000, %xmm0, %xmm4
  1617. pxor %xmm4, %xmm0
  1618. shufps $0b10001100, %xmm0, %xmm4
  1619. pxor %xmm4, %xmm0
  1620. pxor %xmm1, %xmm0
  1621. movaps %xmm2, %xmm5
  1622. pslldq $4, %xmm5
  1623. pshufd $0b11111111, %xmm0, %xmm3
  1624. pxor %xmm3, %xmm2
  1625. pxor %xmm5, %xmm2
  1626. movaps %xmm0, (TKEYP)
  1627. add $0x10, TKEYP
  1628. ret
  1629. ENDPROC(_key_expansion_192b)
  1630. .align 4
  1631. _key_expansion_256b:
  1632. pshufd $0b10101010, %xmm1, %xmm1
  1633. shufps $0b00010000, %xmm2, %xmm4
  1634. pxor %xmm4, %xmm2
  1635. shufps $0b10001100, %xmm2, %xmm4
  1636. pxor %xmm4, %xmm2
  1637. pxor %xmm1, %xmm2
  1638. movaps %xmm2, (TKEYP)
  1639. add $0x10, TKEYP
  1640. ret
  1641. ENDPROC(_key_expansion_256b)
  1642. /*
  1643. * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
  1644. * unsigned int key_len)
  1645. */
  1646. ENTRY(aesni_set_key)
  1647. #ifndef __x86_64__
  1648. pushl KEYP
  1649. movl 8(%esp), KEYP # ctx
  1650. movl 12(%esp), UKEYP # in_key
  1651. movl 16(%esp), %edx # key_len
  1652. #endif
  1653. movups (UKEYP), %xmm0 # user key (first 16 bytes)
  1654. movaps %xmm0, (KEYP)
  1655. lea 0x10(KEYP), TKEYP # key addr
  1656. movl %edx, 480(KEYP)
  1657. pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
  1658. cmp $24, %dl
  1659. jb .Lenc_key128
  1660. je .Lenc_key192
  1661. movups 0x10(UKEYP), %xmm2 # other user key
  1662. movaps %xmm2, (TKEYP)
  1663. add $0x10, TKEYP
  1664. AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
  1665. call _key_expansion_256a
  1666. AESKEYGENASSIST 0x1 %xmm0 %xmm1
  1667. call _key_expansion_256b
  1668. AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
  1669. call _key_expansion_256a
  1670. AESKEYGENASSIST 0x2 %xmm0 %xmm1
  1671. call _key_expansion_256b
  1672. AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
  1673. call _key_expansion_256a
  1674. AESKEYGENASSIST 0x4 %xmm0 %xmm1
  1675. call _key_expansion_256b
  1676. AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
  1677. call _key_expansion_256a
  1678. AESKEYGENASSIST 0x8 %xmm0 %xmm1
  1679. call _key_expansion_256b
  1680. AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
  1681. call _key_expansion_256a
  1682. AESKEYGENASSIST 0x10 %xmm0 %xmm1
  1683. call _key_expansion_256b
  1684. AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
  1685. call _key_expansion_256a
  1686. AESKEYGENASSIST 0x20 %xmm0 %xmm1
  1687. call _key_expansion_256b
  1688. AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
  1689. call _key_expansion_256a
  1690. jmp .Ldec_key
  1691. .Lenc_key192:
  1692. movq 0x10(UKEYP), %xmm2 # other user key
  1693. AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
  1694. call _key_expansion_192a
  1695. AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
  1696. call _key_expansion_192b
  1697. AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
  1698. call _key_expansion_192a
  1699. AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
  1700. call _key_expansion_192b
  1701. AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
  1702. call _key_expansion_192a
  1703. AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
  1704. call _key_expansion_192b
  1705. AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
  1706. call _key_expansion_192a
  1707. AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
  1708. call _key_expansion_192b
  1709. jmp .Ldec_key
  1710. .Lenc_key128:
  1711. AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
  1712. call _key_expansion_128
  1713. AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
  1714. call _key_expansion_128
  1715. AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
  1716. call _key_expansion_128
  1717. AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
  1718. call _key_expansion_128
  1719. AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
  1720. call _key_expansion_128
  1721. AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
  1722. call _key_expansion_128
  1723. AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
  1724. call _key_expansion_128
  1725. AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
  1726. call _key_expansion_128
  1727. AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
  1728. call _key_expansion_128
  1729. AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
  1730. call _key_expansion_128
  1731. .Ldec_key:
  1732. sub $0x10, TKEYP
  1733. movaps (KEYP), %xmm0
  1734. movaps (TKEYP), %xmm1
  1735. movaps %xmm0, 240(TKEYP)
  1736. movaps %xmm1, 240(KEYP)
  1737. add $0x10, KEYP
  1738. lea 240-16(TKEYP), UKEYP
  1739. .align 4
  1740. .Ldec_key_loop:
  1741. movaps (KEYP), %xmm0
  1742. AESIMC %xmm0 %xmm1
  1743. movaps %xmm1, (UKEYP)
  1744. add $0x10, KEYP
  1745. sub $0x10, UKEYP
  1746. cmp TKEYP, KEYP
  1747. jb .Ldec_key_loop
  1748. xor AREG, AREG
  1749. #ifndef __x86_64__
  1750. popl KEYP
  1751. #endif
  1752. ret
  1753. ENDPROC(aesni_set_key)
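/*
 * Reference sketch (illustrative): the .Ldec_key section above builds the
 * decryption schedule stored 240 bytes into the key context by reversing
 * the encryption round keys and running AESIMC (InvMixColumns) over the
 * inner ones. aes_imc() below is a stand-in for the AESIMC instruction;
 * all names are invented for the sketch.
 *
 *	typedef struct { unsigned char b[16]; } aes_rk;
 *
 *	static void build_dec_schedule(const aes_rk *enc, aes_rk *dec,
 *				       int nrounds, aes_rk (*aes_imc)(aes_rk))
 *	{
 *		int i;
 *
 *		dec[0] = enc[nrounds];		// first dec key = last enc key
 *		dec[nrounds] = enc[0];		// last dec key = round 0 key
 *		for (i = 1; i < nrounds; i++)	// AESIMC on the middle keys
 *			dec[i] = aes_imc(enc[nrounds - i]);
 *	}
 */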
  1754. /*
  1755. * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  1756. */
  1757. ENTRY(aesni_enc)
  1758. #ifndef __x86_64__
  1759. pushl KEYP
  1760. pushl KLEN
  1761. movl 12(%esp), KEYP
  1762. movl 16(%esp), OUTP
  1763. movl 20(%esp), INP
  1764. #endif
  1765. movl 480(KEYP), KLEN # key length
  1766. movups (INP), STATE # input
  1767. call _aesni_enc1
  1768. movups STATE, (OUTP) # output
  1769. #ifndef __x86_64__
  1770. popl KLEN
  1771. popl KEYP
  1772. #endif
  1773. ret
  1774. ENDPROC(aesni_enc)
  1775. /*
  1776. * _aesni_enc1: internal ABI
  1777. * input:
  1778. * KEYP: key struct pointer
1779. * KLEN: key length
1780. * STATE: initial state (input)
1781. * output:
1782. * STATE: final state (output)
  1783. * changed:
  1784. * KEY
  1785. * TKEYP (T1)
  1786. */
  1787. .align 4
  1788. _aesni_enc1:
  1789. movaps (KEYP), KEY # key
  1790. mov KEYP, TKEYP
  1791. pxor KEY, STATE # round 0
  1792. add $0x30, TKEYP
  1793. cmp $24, KLEN
  1794. jb .Lenc128
  1795. lea 0x20(TKEYP), TKEYP
  1796. je .Lenc192
  1797. add $0x20, TKEYP
  1798. movaps -0x60(TKEYP), KEY
  1799. AESENC KEY STATE
  1800. movaps -0x50(TKEYP), KEY
  1801. AESENC KEY STATE
  1802. .align 4
  1803. .Lenc192:
  1804. movaps -0x40(TKEYP), KEY
  1805. AESENC KEY STATE
  1806. movaps -0x30(TKEYP), KEY
  1807. AESENC KEY STATE
  1808. .align 4
  1809. .Lenc128:
  1810. movaps -0x20(TKEYP), KEY
  1811. AESENC KEY STATE
  1812. movaps -0x10(TKEYP), KEY
  1813. AESENC KEY STATE
  1814. movaps (TKEYP), KEY
  1815. AESENC KEY STATE
  1816. movaps 0x10(TKEYP), KEY
  1817. AESENC KEY STATE
  1818. movaps 0x20(TKEYP), KEY
  1819. AESENC KEY STATE
  1820. movaps 0x30(TKEYP), KEY
  1821. AESENC KEY STATE
  1822. movaps 0x40(TKEYP), KEY
  1823. AESENC KEY STATE
  1824. movaps 0x50(TKEYP), KEY
  1825. AESENC KEY STATE
  1826. movaps 0x60(TKEYP), KEY
  1827. AESENC KEY STATE
  1828. movaps 0x70(TKEYP), KEY
  1829. AESENCLAST KEY STATE
  1830. ret
  1831. ENDPROC(_aesni_enc1)
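/*
 * Reference sketch (illustrative): _aesni_enc1 selects the number of AES
 * rounds from the key length stored at 480(KEYP) by comparing KLEN with
 * 24 and jumping into the AESENC ladder at the matching depth. The
 * function name is invented for the sketch.
 *
 *	static int aes_nrounds(unsigned int key_len_bytes)
 *	{
 *		if (key_len_bytes < 24)
 *			return 10;	// 128-bit key, .Lenc128 entry
 *		if (key_len_bytes == 24)
 *			return 12;	// 192-bit key, .Lenc192 entry
 *		return 14;		// 256-bit key, full ladder
 *	}
 */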
  1832. /*
  1833. * _aesni_enc4: internal ABI
  1834. * input:
  1835. * KEYP: key struct pointer
1836. * KLEN: key length
1837. * STATE1: initial state (input)
1838. * STATE2
1839. * STATE3
1840. * STATE4
1841. * output:
1842. * STATE1: final state (output)
  1843. * STATE2
  1844. * STATE3
  1845. * STATE4
  1846. * changed:
  1847. * KEY
  1848. * TKEYP (T1)
  1849. */
  1850. .align 4
  1851. _aesni_enc4:
  1852. movaps (KEYP), KEY # key
  1853. mov KEYP, TKEYP
  1854. pxor KEY, STATE1 # round 0
  1855. pxor KEY, STATE2
  1856. pxor KEY, STATE3
  1857. pxor KEY, STATE4
  1858. add $0x30, TKEYP
  1859. cmp $24, KLEN
  1860. jb .L4enc128
  1861. lea 0x20(TKEYP), TKEYP
  1862. je .L4enc192
  1863. add $0x20, TKEYP
  1864. movaps -0x60(TKEYP), KEY
  1865. AESENC KEY STATE1
  1866. AESENC KEY STATE2
  1867. AESENC KEY STATE3
  1868. AESENC KEY STATE4
  1869. movaps -0x50(TKEYP), KEY
  1870. AESENC KEY STATE1
  1871. AESENC KEY STATE2
  1872. AESENC KEY STATE3
  1873. AESENC KEY STATE4
  1874. #.align 4
  1875. .L4enc192:
  1876. movaps -0x40(TKEYP), KEY
  1877. AESENC KEY STATE1
  1878. AESENC KEY STATE2
  1879. AESENC KEY STATE3
  1880. AESENC KEY STATE4
  1881. movaps -0x30(TKEYP), KEY
  1882. AESENC KEY STATE1
  1883. AESENC KEY STATE2
  1884. AESENC KEY STATE3
  1885. AESENC KEY STATE4
  1886. #.align 4
  1887. .L4enc128:
  1888. movaps -0x20(TKEYP), KEY
  1889. AESENC KEY STATE1
  1890. AESENC KEY STATE2
  1891. AESENC KEY STATE3
  1892. AESENC KEY STATE4
  1893. movaps -0x10(TKEYP), KEY
  1894. AESENC KEY STATE1
  1895. AESENC KEY STATE2
  1896. AESENC KEY STATE3
  1897. AESENC KEY STATE4
  1898. movaps (TKEYP), KEY
  1899. AESENC KEY STATE1
  1900. AESENC KEY STATE2
  1901. AESENC KEY STATE3
  1902. AESENC KEY STATE4
  1903. movaps 0x10(TKEYP), KEY
  1904. AESENC KEY STATE1
  1905. AESENC KEY STATE2
  1906. AESENC KEY STATE3
  1907. AESENC KEY STATE4
  1908. movaps 0x20(TKEYP), KEY
  1909. AESENC KEY STATE1
  1910. AESENC KEY STATE2
  1911. AESENC KEY STATE3
  1912. AESENC KEY STATE4
  1913. movaps 0x30(TKEYP), KEY
  1914. AESENC KEY STATE1
  1915. AESENC KEY STATE2
  1916. AESENC KEY STATE3
  1917. AESENC KEY STATE4
  1918. movaps 0x40(TKEYP), KEY
  1919. AESENC KEY STATE1
  1920. AESENC KEY STATE2
  1921. AESENC KEY STATE3
  1922. AESENC KEY STATE4
  1923. movaps 0x50(TKEYP), KEY
  1924. AESENC KEY STATE1
  1925. AESENC KEY STATE2
  1926. AESENC KEY STATE3
  1927. AESENC KEY STATE4
  1928. movaps 0x60(TKEYP), KEY
  1929. AESENC KEY STATE1
  1930. AESENC KEY STATE2
  1931. AESENC KEY STATE3
  1932. AESENC KEY STATE4
  1933. movaps 0x70(TKEYP), KEY
  1934. AESENCLAST KEY STATE1 # last round
  1935. AESENCLAST KEY STATE2
  1936. AESENCLAST KEY STATE3
  1937. AESENCLAST KEY STATE4
  1938. ret
  1939. ENDPROC(_aesni_enc4)
  1940. /*
1941. * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
  1942. */
  1943. ENTRY(aesni_dec)
  1944. #ifndef __x86_64__
  1945. pushl KEYP
  1946. pushl KLEN
  1947. movl 12(%esp), KEYP
  1948. movl 16(%esp), OUTP
  1949. movl 20(%esp), INP
  1950. #endif
  1951. mov 480(KEYP), KLEN # key length
  1952. add $240, KEYP
  1953. movups (INP), STATE # input
  1954. call _aesni_dec1
1955. movups STATE, (OUTP) # output
  1956. #ifndef __x86_64__
  1957. popl KLEN
  1958. popl KEYP
  1959. #endif
  1960. ret
  1961. ENDPROC(aesni_dec)
  1962. /*
  1963. * _aesni_dec1: internal ABI
  1964. * input:
  1965. * KEYP: key struct pointer
  1966. * KLEN: key length
  1967. * STATE: initial state (input)
  1968. * output:
1969. * STATE: final state (output)
  1970. * changed:
  1971. * KEY
  1972. * TKEYP (T1)
  1973. */
  1974. .align 4
  1975. _aesni_dec1:
  1976. movaps (KEYP), KEY # key
  1977. mov KEYP, TKEYP
  1978. pxor KEY, STATE # round 0
  1979. add $0x30, TKEYP
  1980. cmp $24, KLEN
  1981. jb .Ldec128
  1982. lea 0x20(TKEYP), TKEYP
  1983. je .Ldec192
  1984. add $0x20, TKEYP
  1985. movaps -0x60(TKEYP), KEY
  1986. AESDEC KEY STATE
  1987. movaps -0x50(TKEYP), KEY
  1988. AESDEC KEY STATE
  1989. .align 4
  1990. .Ldec192:
  1991. movaps -0x40(TKEYP), KEY
  1992. AESDEC KEY STATE
  1993. movaps -0x30(TKEYP), KEY
  1994. AESDEC KEY STATE
  1995. .align 4
  1996. .Ldec128:
  1997. movaps -0x20(TKEYP), KEY
  1998. AESDEC KEY STATE
  1999. movaps -0x10(TKEYP), KEY
  2000. AESDEC KEY STATE
  2001. movaps (TKEYP), KEY
  2002. AESDEC KEY STATE
  2003. movaps 0x10(TKEYP), KEY
  2004. AESDEC KEY STATE
  2005. movaps 0x20(TKEYP), KEY
  2006. AESDEC KEY STATE
  2007. movaps 0x30(TKEYP), KEY
  2008. AESDEC KEY STATE
  2009. movaps 0x40(TKEYP), KEY
  2010. AESDEC KEY STATE
  2011. movaps 0x50(TKEYP), KEY
  2012. AESDEC KEY STATE
  2013. movaps 0x60(TKEYP), KEY
  2014. AESDEC KEY STATE
  2015. movaps 0x70(TKEYP), KEY
  2016. AESDECLAST KEY STATE
  2017. ret
  2018. ENDPROC(_aesni_dec1)
  2019. /*
  2020. * _aesni_dec4: internal ABI
  2021. * input:
  2022. * KEYP: key struct pointer
  2023. * KLEN: key length
  2024. * STATE1: initial state (input)
  2025. * STATE2
  2026. * STATE3
  2027. * STATE4
  2028. * output:
2029. * STATE1: final state (output)
  2030. * STATE2
  2031. * STATE3
  2032. * STATE4
  2033. * changed:
  2034. * KEY
  2035. * TKEYP (T1)
  2036. */
  2037. .align 4
  2038. _aesni_dec4:
  2039. movaps (KEYP), KEY # key
  2040. mov KEYP, TKEYP
  2041. pxor KEY, STATE1 # round 0
  2042. pxor KEY, STATE2
  2043. pxor KEY, STATE3
  2044. pxor KEY, STATE4
  2045. add $0x30, TKEYP
  2046. cmp $24, KLEN
  2047. jb .L4dec128
  2048. lea 0x20(TKEYP), TKEYP
  2049. je .L4dec192
  2050. add $0x20, TKEYP
  2051. movaps -0x60(TKEYP), KEY
  2052. AESDEC KEY STATE1
  2053. AESDEC KEY STATE2
  2054. AESDEC KEY STATE3
  2055. AESDEC KEY STATE4
  2056. movaps -0x50(TKEYP), KEY
  2057. AESDEC KEY STATE1
  2058. AESDEC KEY STATE2
  2059. AESDEC KEY STATE3
  2060. AESDEC KEY STATE4
  2061. .align 4
  2062. .L4dec192:
  2063. movaps -0x40(TKEYP), KEY
  2064. AESDEC KEY STATE1
  2065. AESDEC KEY STATE2
  2066. AESDEC KEY STATE3
  2067. AESDEC KEY STATE4
  2068. movaps -0x30(TKEYP), KEY
  2069. AESDEC KEY STATE1
  2070. AESDEC KEY STATE2
  2071. AESDEC KEY STATE3
  2072. AESDEC KEY STATE4
  2073. .align 4
  2074. .L4dec128:
  2075. movaps -0x20(TKEYP), KEY
  2076. AESDEC KEY STATE1
  2077. AESDEC KEY STATE2
  2078. AESDEC KEY STATE3
  2079. AESDEC KEY STATE4
  2080. movaps -0x10(TKEYP), KEY
  2081. AESDEC KEY STATE1
  2082. AESDEC KEY STATE2
  2083. AESDEC KEY STATE3
  2084. AESDEC KEY STATE4
  2085. movaps (TKEYP), KEY
  2086. AESDEC KEY STATE1
  2087. AESDEC KEY STATE2
  2088. AESDEC KEY STATE3
  2089. AESDEC KEY STATE4
  2090. movaps 0x10(TKEYP), KEY
  2091. AESDEC KEY STATE1
  2092. AESDEC KEY STATE2
  2093. AESDEC KEY STATE3
  2094. AESDEC KEY STATE4
  2095. movaps 0x20(TKEYP), KEY
  2096. AESDEC KEY STATE1
  2097. AESDEC KEY STATE2
  2098. AESDEC KEY STATE3
  2099. AESDEC KEY STATE4
  2100. movaps 0x30(TKEYP), KEY
  2101. AESDEC KEY STATE1
  2102. AESDEC KEY STATE2
  2103. AESDEC KEY STATE3
  2104. AESDEC KEY STATE4
  2105. movaps 0x40(TKEYP), KEY
  2106. AESDEC KEY STATE1
  2107. AESDEC KEY STATE2
  2108. AESDEC KEY STATE3
  2109. AESDEC KEY STATE4
  2110. movaps 0x50(TKEYP), KEY
  2111. AESDEC KEY STATE1
  2112. AESDEC KEY STATE2
  2113. AESDEC KEY STATE3
  2114. AESDEC KEY STATE4
  2115. movaps 0x60(TKEYP), KEY
  2116. AESDEC KEY STATE1
  2117. AESDEC KEY STATE2
  2118. AESDEC KEY STATE3
  2119. AESDEC KEY STATE4
  2120. movaps 0x70(TKEYP), KEY
  2121. AESDECLAST KEY STATE1 # last round
  2122. AESDECLAST KEY STATE2
  2123. AESDECLAST KEY STATE3
  2124. AESDECLAST KEY STATE4
  2125. ret
  2126. ENDPROC(_aesni_dec4)
  2127. /*
2128. * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2129. * size_t len)
  2130. */
  2131. ENTRY(aesni_ecb_enc)
  2132. #ifndef __x86_64__
  2133. pushl LEN
  2134. pushl KEYP
  2135. pushl KLEN
  2136. movl 16(%esp), KEYP
  2137. movl 20(%esp), OUTP
  2138. movl 24(%esp), INP
  2139. movl 28(%esp), LEN
  2140. #endif
  2141. test LEN, LEN # check length
  2142. jz .Lecb_enc_ret
  2143. mov 480(KEYP), KLEN
  2144. cmp $16, LEN
  2145. jb .Lecb_enc_ret
  2146. cmp $64, LEN
  2147. jb .Lecb_enc_loop1
  2148. .align 4
  2149. .Lecb_enc_loop4:
  2150. movups (INP), STATE1
  2151. movups 0x10(INP), STATE2
  2152. movups 0x20(INP), STATE3
  2153. movups 0x30(INP), STATE4
  2154. call _aesni_enc4
  2155. movups STATE1, (OUTP)
  2156. movups STATE2, 0x10(OUTP)
  2157. movups STATE3, 0x20(OUTP)
  2158. movups STATE4, 0x30(OUTP)
  2159. sub $64, LEN
  2160. add $64, INP
  2161. add $64, OUTP
  2162. cmp $64, LEN
  2163. jge .Lecb_enc_loop4
  2164. cmp $16, LEN
  2165. jb .Lecb_enc_ret
  2166. .align 4
  2167. .Lecb_enc_loop1:
  2168. movups (INP), STATE1
  2169. call _aesni_enc1
  2170. movups STATE1, (OUTP)
  2171. sub $16, LEN
  2172. add $16, INP
  2173. add $16, OUTP
  2174. cmp $16, LEN
  2175. jge .Lecb_enc_loop1
  2176. .Lecb_enc_ret:
  2177. #ifndef __x86_64__
  2178. popl KLEN
  2179. popl KEYP
  2180. popl LEN
  2181. #endif
  2182. ret
  2183. ENDPROC(aesni_ecb_enc)
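/*
 * Reference sketch (illustrative): the loop structure of aesni_ecb_enc -
 * four blocks at a time while at least 64 bytes remain, then one block
 * at a time. enc1()/enc4() stand in for _aesni_enc1/_aesni_enc4; all
 * names are invented for the sketch.
 *
 *	static void ecb_enc_sketch(unsigned char *out, const unsigned char *in,
 *				   unsigned long len,
 *				   void (*enc1)(unsigned char *, const unsigned char *),
 *				   void (*enc4)(unsigned char *, const unsigned char *))
 *	{
 *		if (len < 16)
 *			return;
 *		while (len >= 64) {		// .Lecb_enc_loop4
 *			enc4(out, in);
 *			in += 64; out += 64; len -= 64;
 *		}
 *		while (len >= 16) {		// .Lecb_enc_loop1
 *			enc1(out, in);
 *			in += 16; out += 16; len -= 16;
 *		}
 *	}
 */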
  2184. /*
2185. * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2186. * size_t len);
  2187. */
  2188. ENTRY(aesni_ecb_dec)
  2189. #ifndef __x86_64__
  2190. pushl LEN
  2191. pushl KEYP
  2192. pushl KLEN
  2193. movl 16(%esp), KEYP
  2194. movl 20(%esp), OUTP
  2195. movl 24(%esp), INP
  2196. movl 28(%esp), LEN
  2197. #endif
  2198. test LEN, LEN
  2199. jz .Lecb_dec_ret
  2200. mov 480(KEYP), KLEN
  2201. add $240, KEYP
  2202. cmp $16, LEN
  2203. jb .Lecb_dec_ret
  2204. cmp $64, LEN
  2205. jb .Lecb_dec_loop1
  2206. .align 4
  2207. .Lecb_dec_loop4:
  2208. movups (INP), STATE1
  2209. movups 0x10(INP), STATE2
  2210. movups 0x20(INP), STATE3
  2211. movups 0x30(INP), STATE4
  2212. call _aesni_dec4
  2213. movups STATE1, (OUTP)
  2214. movups STATE2, 0x10(OUTP)
  2215. movups STATE3, 0x20(OUTP)
  2216. movups STATE4, 0x30(OUTP)
  2217. sub $64, LEN
  2218. add $64, INP
  2219. add $64, OUTP
  2220. cmp $64, LEN
  2221. jge .Lecb_dec_loop4
  2222. cmp $16, LEN
  2223. jb .Lecb_dec_ret
  2224. .align 4
  2225. .Lecb_dec_loop1:
  2226. movups (INP), STATE1
  2227. call _aesni_dec1
  2228. movups STATE1, (OUTP)
  2229. sub $16, LEN
  2230. add $16, INP
  2231. add $16, OUTP
  2232. cmp $16, LEN
  2233. jge .Lecb_dec_loop1
  2234. .Lecb_dec_ret:
  2235. #ifndef __x86_64__
  2236. popl KLEN
  2237. popl KEYP
  2238. popl LEN
  2239. #endif
  2240. ret
  2241. ENDPROC(aesni_ecb_dec)
  2242. /*
2243. * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2244. * size_t len, u8 *iv)
  2245. */
  2246. ENTRY(aesni_cbc_enc)
  2247. #ifndef __x86_64__
  2248. pushl IVP
  2249. pushl LEN
  2250. pushl KEYP
  2251. pushl KLEN
  2252. movl 20(%esp), KEYP
  2253. movl 24(%esp), OUTP
  2254. movl 28(%esp), INP
  2255. movl 32(%esp), LEN
  2256. movl 36(%esp), IVP
  2257. #endif
  2258. cmp $16, LEN
  2259. jb .Lcbc_enc_ret
  2260. mov 480(KEYP), KLEN
  2261. movups (IVP), STATE # load iv as initial state
  2262. .align 4
  2263. .Lcbc_enc_loop:
  2264. movups (INP), IN # load input
  2265. pxor IN, STATE
  2266. call _aesni_enc1
  2267. movups STATE, (OUTP) # store output
  2268. sub $16, LEN
  2269. add $16, INP
  2270. add $16, OUTP
  2271. cmp $16, LEN
  2272. jge .Lcbc_enc_loop
  2273. movups STATE, (IVP)
  2274. .Lcbc_enc_ret:
  2275. #ifndef __x86_64__
  2276. popl KLEN
  2277. popl KEYP
  2278. popl LEN
  2279. popl IVP
  2280. #endif
  2281. ret
  2282. ENDPROC(aesni_cbc_enc)
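/*
 * Reference sketch (illustrative): the CBC encryption chaining performed
 * by the loop above - each plaintext block is XORed with the previous
 * ciphertext block (starting from the IV) before encryption, and the
 * final ciphertext block is written back through the iv pointer.
 * aes_enc() stands in for _aesni_enc1; names are invented for the sketch.
 *
 *	static void cbc_enc_sketch(unsigned char *out, const unsigned char *in,
 *				   unsigned long len, unsigned char iv[16],
 *				   void (*aes_enc)(unsigned char state[16]))
 *	{
 *		unsigned int i;
 *
 *		while (len >= 16) {
 *			for (i = 0; i < 16; i++)
 *				iv[i] ^= in[i];		// pxor IN, STATE
 *			aes_enc(iv);			// call _aesni_enc1
 *			for (i = 0; i < 16; i++)
 *				out[i] = iv[i];		// movups STATE, (OUTP)
 *			in += 16; out += 16; len -= 16;
 *		}
 *	}
 */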
  2283. /*
2284. * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2285. * size_t len, u8 *iv)
  2286. */
  2287. ENTRY(aesni_cbc_dec)
  2288. #ifndef __x86_64__
  2289. pushl IVP
  2290. pushl LEN
  2291. pushl KEYP
  2292. pushl KLEN
  2293. movl 20(%esp), KEYP
  2294. movl 24(%esp), OUTP
  2295. movl 28(%esp), INP
  2296. movl 32(%esp), LEN
  2297. movl 36(%esp), IVP
  2298. #endif
  2299. cmp $16, LEN
  2300. jb .Lcbc_dec_just_ret
  2301. mov 480(KEYP), KLEN
  2302. add $240, KEYP
  2303. movups (IVP), IV
  2304. cmp $64, LEN
  2305. jb .Lcbc_dec_loop1
  2306. .align 4
  2307. .Lcbc_dec_loop4:
  2308. movups (INP), IN1
  2309. movaps IN1, STATE1
  2310. movups 0x10(INP), IN2
  2311. movaps IN2, STATE2
  2312. #ifdef __x86_64__
  2313. movups 0x20(INP), IN3
  2314. movaps IN3, STATE3
  2315. movups 0x30(INP), IN4
  2316. movaps IN4, STATE4
  2317. #else
  2318. movups 0x20(INP), IN1
  2319. movaps IN1, STATE3
  2320. movups 0x30(INP), IN2
  2321. movaps IN2, STATE4
  2322. #endif
  2323. call _aesni_dec4
  2324. pxor IV, STATE1
  2325. #ifdef __x86_64__
  2326. pxor IN1, STATE2
  2327. pxor IN2, STATE3
  2328. pxor IN3, STATE4
  2329. movaps IN4, IV
  2330. #else
  2331. pxor IN1, STATE4
  2332. movaps IN2, IV
  2333. movups (INP), IN1
  2334. pxor IN1, STATE2
  2335. movups 0x10(INP), IN2
  2336. pxor IN2, STATE3
  2337. #endif
  2338. movups STATE1, (OUTP)
  2339. movups STATE2, 0x10(OUTP)
  2340. movups STATE3, 0x20(OUTP)
  2341. movups STATE4, 0x30(OUTP)
  2342. sub $64, LEN
  2343. add $64, INP
  2344. add $64, OUTP
  2345. cmp $64, LEN
  2346. jge .Lcbc_dec_loop4
  2347. cmp $16, LEN
  2348. jb .Lcbc_dec_ret
  2349. .align 4
  2350. .Lcbc_dec_loop1:
  2351. movups (INP), IN
  2352. movaps IN, STATE
  2353. call _aesni_dec1
  2354. pxor IV, STATE
  2355. movups STATE, (OUTP)
  2356. movaps IN, IV
  2357. sub $16, LEN
  2358. add $16, INP
  2359. add $16, OUTP
  2360. cmp $16, LEN
  2361. jge .Lcbc_dec_loop1
  2362. .Lcbc_dec_ret:
  2363. movups IV, (IVP)
  2364. .Lcbc_dec_just_ret:
  2365. #ifndef __x86_64__
  2366. popl KLEN
  2367. popl KEYP
  2368. popl LEN
  2369. popl IVP
  2370. #endif
  2371. ret
  2372. ENDPROC(aesni_cbc_dec)
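/*
 * Reference sketch (illustrative): CBC decryption, which the code above
 * runs four blocks in parallel because each output needs only the
 * previous *ciphertext* block: P[i] = D(K, C[i]) XOR C[i-1], with C[-1]
 * being the IV. aes_dec() stands in for _aesni_dec1; names are invented
 * for the sketch.
 *
 *	static void cbc_dec_block(unsigned char *out, const unsigned char *c,
 *				  const unsigned char *prev_c,
 *				  void (*aes_dec)(unsigned char state[16],
 *						  const unsigned char in[16]))
 *	{
 *		unsigned char state[16];
 *		unsigned int i;
 *
 *		aes_dec(state, c);			// call _aesni_dec1
 *		for (i = 0; i < 16; i++)
 *			out[i] = state[i] ^ prev_c[i];	// pxor IV/IN, STATE
 *	}
 */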
  2373. #ifdef __x86_64__
  2374. .align 16
  2375. .Lbswap_mask:
  2376. .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  2377. /*
  2378. * _aesni_inc_init: internal ABI
  2379. * setup registers used by _aesni_inc
  2380. * input:
  2381. * IV
  2382. * output:
  2383. * CTR: == IV, in little endian
  2384. * TCTR_LOW: == lower qword of CTR
  2385. * INC: == 1, in little endian
  2386. * BSWAP_MASK == endian swapping mask
  2387. */
  2388. .align 4
  2389. _aesni_inc_init:
  2390. movaps .Lbswap_mask, BSWAP_MASK
  2391. movaps IV, CTR
  2392. PSHUFB_XMM BSWAP_MASK CTR
  2393. mov $1, TCTR_LOW
  2394. MOVQ_R64_XMM TCTR_LOW INC
  2395. MOVQ_R64_XMM CTR TCTR_LOW
  2396. ret
  2397. ENDPROC(_aesni_inc_init)
  2398. /*
  2399. * _aesni_inc: internal ABI
2400. * Increment IV by 1; IV is in big endian
  2401. * input:
  2402. * IV
  2403. * CTR: == IV, in little endian
  2404. * TCTR_LOW: == lower qword of CTR
  2405. * INC: == 1, in little endian
  2406. * BSWAP_MASK == endian swapping mask
  2407. * output:
2408. * IV: Increased by 1
  2409. * changed:
  2410. * CTR: == output IV, in little endian
  2411. * TCTR_LOW: == lower qword of CTR
  2412. */
  2413. .align 4
  2414. _aesni_inc:
  2415. paddq INC, CTR
  2416. add $1, TCTR_LOW
  2417. jnc .Linc_low
  2418. pslldq $8, INC
  2419. paddq INC, CTR
  2420. psrldq $8, INC
  2421. .Linc_low:
  2422. movaps CTR, IV
  2423. PSHUFB_XMM BSWAP_MASK IV
  2424. ret
  2425. ENDPROC(_aesni_inc)
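/*
 * Reference sketch (illustrative): _aesni_inc keeps the counter as a
 * little-endian 128-bit value in CTR (with its low qword mirrored in
 * TCTR_LOW) and byte-swaps it through BSWAP_MASK to produce the
 * big-endian IV. The increment itself is a plain 128-bit add with a
 * carry from the low qword into the high qword; the function name below
 * is invented for the sketch.
 *
 *	static void ctr128_inc(unsigned long long ctr[2])	// [0]=low, [1]=high
 *	{
 *		if (++ctr[0] == 0)	// add $1, TCTR_LOW; jnc .Linc_low
 *			ctr[1]++;	// carry: paddq the INC shifted up
 *	}
 */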
  2426. /*
2427. * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2428. * size_t len, u8 *iv)
  2429. */
  2430. ENTRY(aesni_ctr_enc)
  2431. cmp $16, LEN
  2432. jb .Lctr_enc_just_ret
  2433. mov 480(KEYP), KLEN
  2434. movups (IVP), IV
  2435. call _aesni_inc_init
  2436. cmp $64, LEN
  2437. jb .Lctr_enc_loop1
  2438. .align 4
  2439. .Lctr_enc_loop4:
  2440. movaps IV, STATE1
  2441. call _aesni_inc
  2442. movups (INP), IN1
  2443. movaps IV, STATE2
  2444. call _aesni_inc
  2445. movups 0x10(INP), IN2
  2446. movaps IV, STATE3
  2447. call _aesni_inc
  2448. movups 0x20(INP), IN3
  2449. movaps IV, STATE4
  2450. call _aesni_inc
  2451. movups 0x30(INP), IN4
  2452. call _aesni_enc4
  2453. pxor IN1, STATE1
  2454. movups STATE1, (OUTP)
  2455. pxor IN2, STATE2
  2456. movups STATE2, 0x10(OUTP)
  2457. pxor IN3, STATE3
  2458. movups STATE3, 0x20(OUTP)
  2459. pxor IN4, STATE4
  2460. movups STATE4, 0x30(OUTP)
  2461. sub $64, LEN
  2462. add $64, INP
  2463. add $64, OUTP
  2464. cmp $64, LEN
  2465. jge .Lctr_enc_loop4
  2466. cmp $16, LEN
  2467. jb .Lctr_enc_ret
  2468. .align 4
  2469. .Lctr_enc_loop1:
  2470. movaps IV, STATE
  2471. call _aesni_inc
  2472. movups (INP), IN
  2473. call _aesni_enc1
  2474. pxor IN, STATE
  2475. movups STATE, (OUTP)
  2476. sub $16, LEN
  2477. add $16, INP
  2478. add $16, OUTP
  2479. cmp $16, LEN
  2480. jge .Lctr_enc_loop1
  2481. .Lctr_enc_ret:
  2482. movups IV, (IVP)
  2483. .Lctr_enc_just_ret:
  2484. ret
  2485. ENDPROC(aesni_ctr_enc)
  2486. /*
  2487. * _aesni_gf128mul_x_ble: internal ABI
  2488. * Multiply in GF(2^128) for XTS IVs
  2489. * input:
  2490. * IV: current IV
  2491. * GF128MUL_MASK == mask with 0x87 and 0x01
  2492. * output:
  2493. * IV: next IV
  2494. * changed:
  2495. * CTR: == temporary value
  2496. */
  2497. #define _aesni_gf128mul_x_ble() \
  2498. pshufd $0x13, IV, CTR; \
  2499. paddq IV, IV; \
  2500. psrad $31, CTR; \
  2501. pand GF128MUL_MASK, CTR; \
  2502. pxor CTR, IV;
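/*
 * Reference sketch (illustrative): the tweak update performed by
 * _aesni_gf128mul_x_ble() - multiply the 128-bit XTS tweak by x in the
 * little-endian ("ble") block convention, folding the carried-out bit
 * back in with the 0x87 reduction constant. The function name is
 * invented for the sketch.
 *
 *	static void gf128mul_x_ble_sketch(unsigned long long t[2])	// [0]=low, [1]=high
 *	{
 *		unsigned long long carry = t[1] >> 63;
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
 *	}
 */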
  2503. /*
2504. * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2505. * bool enc, u8 *iv)
  2506. */
  2507. ENTRY(aesni_xts_crypt8)
  2508. cmpb $0, %cl
  2509. movl $0, %ecx
  2510. movl $240, %r10d
  2511. leaq _aesni_enc4, %r11
  2512. leaq _aesni_dec4, %rax
  2513. cmovel %r10d, %ecx
  2514. cmoveq %rax, %r11
  2515. movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
  2516. movups (IVP), IV
  2517. mov 480(KEYP), KLEN
  2518. addq %rcx, KEYP
  2519. movdqa IV, STATE1
  2520. movdqu 0x00(INP), INC
  2521. pxor INC, STATE1
  2522. movdqu IV, 0x00(OUTP)
  2523. _aesni_gf128mul_x_ble()
  2524. movdqa IV, STATE2
  2525. movdqu 0x10(INP), INC
  2526. pxor INC, STATE2
  2527. movdqu IV, 0x10(OUTP)
  2528. _aesni_gf128mul_x_ble()
  2529. movdqa IV, STATE3
  2530. movdqu 0x20(INP), INC
  2531. pxor INC, STATE3
  2532. movdqu IV, 0x20(OUTP)
  2533. _aesni_gf128mul_x_ble()
  2534. movdqa IV, STATE4
  2535. movdqu 0x30(INP), INC
  2536. pxor INC, STATE4
  2537. movdqu IV, 0x30(OUTP)
  2538. call *%r11
  2539. movdqu 0x00(OUTP), INC
  2540. pxor INC, STATE1
  2541. movdqu STATE1, 0x00(OUTP)
  2542. _aesni_gf128mul_x_ble()
  2543. movdqa IV, STATE1
  2544. movdqu 0x40(INP), INC
  2545. pxor INC, STATE1
  2546. movdqu IV, 0x40(OUTP)
  2547. movdqu 0x10(OUTP), INC
  2548. pxor INC, STATE2
  2549. movdqu STATE2, 0x10(OUTP)
  2550. _aesni_gf128mul_x_ble()
  2551. movdqa IV, STATE2
  2552. movdqu 0x50(INP), INC
  2553. pxor INC, STATE2
  2554. movdqu IV, 0x50(OUTP)
  2555. movdqu 0x20(OUTP), INC
  2556. pxor INC, STATE3
  2557. movdqu STATE3, 0x20(OUTP)
  2558. _aesni_gf128mul_x_ble()
  2559. movdqa IV, STATE3
  2560. movdqu 0x60(INP), INC
  2561. pxor INC, STATE3
  2562. movdqu IV, 0x60(OUTP)
  2563. movdqu 0x30(OUTP), INC
  2564. pxor INC, STATE4
  2565. movdqu STATE4, 0x30(OUTP)
  2566. _aesni_gf128mul_x_ble()
  2567. movdqa IV, STATE4
  2568. movdqu 0x70(INP), INC
  2569. pxor INC, STATE4
  2570. movdqu IV, 0x70(OUTP)
  2571. _aesni_gf128mul_x_ble()
  2572. movups IV, (IVP)
  2573. call *%r11
  2574. movdqu 0x40(OUTP), INC
  2575. pxor INC, STATE1
  2576. movdqu STATE1, 0x40(OUTP)
  2577. movdqu 0x50(OUTP), INC
  2578. pxor INC, STATE2
  2579. movdqu STATE2, 0x50(OUTP)
  2580. movdqu 0x60(OUTP), INC
  2581. pxor INC, STATE3
  2582. movdqu STATE3, 0x60(OUTP)
  2583. movdqu 0x70(OUTP), INC
  2584. pxor INC, STATE4
  2585. movdqu STATE4, 0x70(OUTP)
  2586. ret
  2587. ENDPROC(aesni_xts_crypt8)
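/*
 * Reference sketch (illustrative): each XTS block above is processed as
 * out = E_or_D(K, in XOR T) XOR T, with the tweak T advanced by the
 * gf128mul_x_ble step between blocks; aesni_xts_crypt8 interleaves two
 * 4-wide _aesni_enc4/_aesni_dec4 passes to cover eight blocks per call,
 * parking each tweak in the output buffer until the second XOR.
 * aes_crypt() and the helper name are invented for the sketch.
 *
 *	static void xts_block(unsigned char *out, const unsigned char *in,
 *			      const unsigned char t[16],
 *			      void (*aes_crypt)(unsigned char state[16]))
 *	{
 *		unsigned char state[16];
 *		unsigned int i;
 *
 *		for (i = 0; i < 16; i++)
 *			state[i] = in[i] ^ t[i];	// pxor INC, STATEn
 *		aes_crypt(state);			// call *%r11
 *		for (i = 0; i < 16; i++)
 *			out[i] = state[i] ^ t[i];	// pxor against saved tweak
 *	}
 */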
  2588. #endif