/*
 * x86_64/AVX2 assembler optimized version of Twofish
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <linux/linkage.h>
#include "glue_helper-asm-avx2.S"

.file "twofish-avx2-asm_64.S"

.data
.align 16

.Lvpshufb_mask0:
	.long 0x80808000
	.long 0x80808004
	.long 0x80808008
	.long 0x8080800c
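
/* 128-bit byte-order reversal mask, passed to load_ctr_16way below */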
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
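
/*
 * Masks for the GF(2^128) multiply-by-α (shift left by one with 0x87
 * reduction) that load_xts_16way uses to derive consecutive XTS tweaks;
 * mask_1 handles the shift-by-two case for the second block of a pair.
 */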
.Lxts_gf128mul_and_shl1_mask_0:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

.text

/* structure of crypto context */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128
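
/*
 * The offsets above are assumed to match a C-side context of roughly
 * this shape (a sketch; the authoritative definition is struct
 * twofish_ctx in the kernel's crypto headers):
 *
 *	struct twofish_ctx {
 *		u32 s[4][256];	// key-dependent s-box tables, 4 KiB
 *		u32 w[8];	// input/output whitening subkeys (offset 4096)
 *		u32 k[32];	// round subkeys K[8..39] (offset 4128)
 *	};
 */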

/* register macros */
#define CTX	%rdi

#define RS0	CTX
#define RS1	%r8
#define RS2	%r9
#define RS3	%r10
#define RK	%r11
#define RW	%rax
#define RROUND	%r12
#define RROUNDd	%r12d

#define RA0	%ymm8
#define RB0	%ymm9
#define RC0	%ymm10
#define RD0	%ymm11
#define RA1	%ymm12
#define RB1	%ymm13
#define RC1	%ymm14
#define RD1	%ymm15

/* temp regs */
#define RX0	%ymm0
#define RY0	%ymm1
#define RX1	%ymm2
#define RY1	%ymm3
#define RT0	%ymm4
#define RIDX	%ymm5

#define RX0x	%xmm0
#define RY0x	%xmm1
#define RX1x	%xmm2
#define RY1x	%xmm3
#define RT0x	%xmm4

/* vpgatherdd mask and '-1' */
#define RNOT	%ymm6

/* byte mask, (-1 >> 24) */
#define RBYTE	%ymm7

/**********************************************************************
  16-way AVX2 twofish
 **********************************************************************/
#define init_round_constants() \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpsrld $24, RNOT, RBYTE; \
	leaq k(CTX), RK; \
	leaq w(CTX), RW; \
	leaq s1(CTX), RS1; \
	leaq s2(CTX), RS2; \
	leaq s3(CTX), RS3;
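
/*
 * Look up all four key-dependent s-boxes for eight 32-bit words per ymm
 * bank (16 blocks total) using vpgatherdd.  Note that vpgatherdd clears
 * its mask operand on completion, so RNOT has to be reset to all-ones
 * (vpcmpeqd) after every gather before it can be used again.
 */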
#define g16(ab, rs0, rs1, rs2, rs3, xy) \
	vpand RBYTE, ab ## 0, RIDX; \
	vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	\
	vpand RBYTE, ab ## 1, RIDX; \
	vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	\
	vpsrld $8, ab ## 0, RIDX; \
	vpand RBYTE, RIDX, RIDX; \
	vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpxor RT0, xy ## 0, xy ## 0; \
	\
	vpsrld $8, ab ## 1, RIDX; \
	vpand RBYTE, RIDX, RIDX; \
	vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpxor RT0, xy ## 1, xy ## 1; \
	\
	vpsrld $16, ab ## 0, RIDX; \
	vpand RBYTE, RIDX, RIDX; \
	vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpxor RT0, xy ## 0, xy ## 0; \
	\
	vpsrld $16, ab ## 1, RIDX; \
	vpand RBYTE, RIDX, RIDX; \
	vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpxor RT0, xy ## 1, xy ## 1; \
	\
	vpsrld $24, ab ## 0, RIDX; \
	vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpxor RT0, xy ## 0, xy ## 0; \
	\
	vpsrld $24, ab ## 1, RIDX; \
	vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpxor RT0, xy ## 1, xy ## 1;
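
/*
 * g1 is the Twofish g() function; g2 is g(ROL(x, 8)).  The byte
 * rotation is folded into the table order: feeding the bytes through
 * the s-boxes in rotated order (RS1, RS2, RS3, RS0) is equivalent to
 * rotating the input left by 8 first.
 */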
#define g1_16(a, x) \
	g16(a, RS0, RS1, RS2, RS3, x);

#define g2_16(b, y) \
	g16(b, RS1, RS2, RS3, RS0, y);
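
/*
 * Twofish round core: with T0 = g1(a) in RX and T1 = g2(b) in RY, the
 * pseudo-Hadamard transform gives F0 = T0 + T1 and F1 = T0 + 2*T1.  The
 * round's two 32-bit subkeys, broadcast from offsets nk and nk+4 past
 * RK + 8*RROUND, are added in; then d ^= F1 and c = ROR(c ^ F0, 1).
 */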
#define encrypt_round_end16(a, b, c, d, nk) \
	vpaddd RY0, RX0, RX0; \
	vpaddd RX0, RY0, RY0; \
	vpbroadcastd nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RX0, RX0; \
	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RY0, RY0; \
	\
	vpxor RY0, d ## 0, d ## 0; \
	\
	vpxor RX0, c ## 0, c ## 0; \
	vpsrld $1, c ## 0, RT0; \
	vpslld $31, c ## 0, c ## 0; \
	vpor RT0, c ## 0, c ## 0; \
	\
	vpaddd RY1, RX1, RX1; \
	vpaddd RX1, RY1, RY1; \
	vpbroadcastd nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RX1, RX1; \
	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RY1, RY1; \
	\
	vpxor RY1, d ## 1, d ## 1; \
	\
	vpxor RX1, c ## 1, c ## 1; \
	vpsrld $1, c ## 1, RT0; \
	vpslld $31, c ## 1, c ## 1; \
	vpor RT0, c ## 1, c ## 1;
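
/*
 * The ROL(d, 1) of each round is hoisted into the previous round: a
 * round rotates its b operand left by 1 right after g2_16() has
 * consumed it, so the register already holds the pre-rotated value when
 * it is XORed as d in the following round.  encrypt_round_first16
 * supplies the initial rotation and encrypt_round_last16 drops the
 * now-unneeded one.
 */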
#define encrypt_round16(a, b, c, d, nk) \
	g2_16(b, RY); \
	\
	vpslld $1, b ## 0, RT0; \
	vpsrld $31, b ## 0, b ## 0; \
	vpor RT0, b ## 0, b ## 0; \
	\
	vpslld $1, b ## 1, RT0; \
	vpsrld $31, b ## 1, b ## 1; \
	vpor RT0, b ## 1, b ## 1; \
	\
	g1_16(a, RX); \
	\
	encrypt_round_end16(a, b, c, d, nk);

#define encrypt_round_first16(a, b, c, d, nk) \
	vpslld $1, d ## 0, RT0; \
	vpsrld $31, d ## 0, d ## 0; \
	vpor RT0, d ## 0, d ## 0; \
	\
	vpslld $1, d ## 1, RT0; \
	vpsrld $31, d ## 1, d ## 1; \
	vpor RT0, d ## 1, d ## 1; \
	\
	encrypt_round16(a, b, c, d, nk);

#define encrypt_round_last16(a, b, c, d, nk) \
	g2_16(b, RY); \
	\
	g1_16(a, RX); \
	\
	encrypt_round_end16(a, b, c, d, nk);
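
/*
 * Decryption mirrors encryption: the in-round ROR(1) moves from c to d,
 * the hoisted ROL(1) moves from b to a, and the subkeys are consumed in
 * reverse order.
 */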
#define decrypt_round_end16(a, b, c, d, nk) \
	vpaddd RY0, RX0, RX0; \
	vpaddd RX0, RY0, RY0; \
	vpbroadcastd nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RX0, RX0; \
	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RY0, RY0; \
	\
	vpxor RX0, c ## 0, c ## 0; \
	\
	vpxor RY0, d ## 0, d ## 0; \
	vpsrld $1, d ## 0, RT0; \
	vpslld $31, d ## 0, d ## 0; \
	vpor RT0, d ## 0, d ## 0; \
	\
	vpaddd RY1, RX1, RX1; \
	vpaddd RX1, RY1, RY1; \
	vpbroadcastd nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RX1, RX1; \
	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
	vpaddd RT0, RY1, RY1; \
	\
	vpxor RX1, c ## 1, c ## 1; \
	\
	vpxor RY1, d ## 1, d ## 1; \
	vpsrld $1, d ## 1, RT0; \
	vpslld $31, d ## 1, d ## 1; \
	vpor RT0, d ## 1, d ## 1;

#define decrypt_round16(a, b, c, d, nk) \
	g1_16(a, RX); \
	\
	vpslld $1, a ## 0, RT0; \
	vpsrld $31, a ## 0, a ## 0; \
	vpor RT0, a ## 0, a ## 0; \
	\
	vpslld $1, a ## 1, RT0; \
	vpsrld $31, a ## 1, a ## 1; \
	vpor RT0, a ## 1, a ## 1; \
	\
	g2_16(b, RY); \
	\
	decrypt_round_end16(a, b, c, d, nk);

#define decrypt_round_first16(a, b, c, d, nk) \
	vpslld $1, c ## 0, RT0; \
	vpsrld $31, c ## 0, c ## 0; \
	vpor RT0, c ## 0, c ## 0; \
	\
	vpslld $1, c ## 1, RT0; \
	vpsrld $31, c ## 1, c ## 1; \
	vpor RT0, c ## 1, c ## 1; \
	\
	decrypt_round16(a, b, c, d, nk);

#define decrypt_round_last16(a, b, c, d, nk) \
	g1_16(a, RX); \
	\
	g2_16(b, RY); \
	\
	decrypt_round_end16(a, b, c, d, nk);
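
/*
 * One cycle is two Twofish rounds.  RROUND holds the even round index;
 * nk = 0 selects the subkey pair of round RROUND and nk = 8 the pair of
 * round RROUND + 1 (8 bytes of round key per round).
 */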
#define encrypt_cycle16() \
	encrypt_round16(RA, RB, RC, RD, 0); \
	encrypt_round16(RC, RD, RA, RB, 8);

#define encrypt_cycle_first16() \
	encrypt_round_first16(RA, RB, RC, RD, 0); \
	encrypt_round16(RC, RD, RA, RB, 8);

#define encrypt_cycle_last16() \
	encrypt_round16(RA, RB, RC, RD, 0); \
	encrypt_round_last16(RC, RD, RA, RB, 8);

#define decrypt_cycle16() \
	decrypt_round16(RC, RD, RA, RB, 8); \
	decrypt_round16(RA, RB, RC, RD, 0);

#define decrypt_cycle_first16() \
	decrypt_round_first16(RC, RD, RA, RB, 8); \
	decrypt_round16(RA, RB, RC, RD, 0);

#define decrypt_cycle_last16() \
	decrypt_round16(RC, RD, RA, RB, 8); \
	decrypt_round_last16(RA, RB, RC, RD, 0);
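
/*
 * In-lane 4x4 dword transpose: each 128-bit lane initially holds one
 * whole block; afterwards each of the four registers holds the same
 * Twofish word (a, b, c or d) of eight blocks.
 */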
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

#define read_blocks8(offs, a, b, c, d) \
	transpose_4x4(a, b, c, d, RX0, RY0);

#define write_blocks8(offs, a, b, c, d) \
	transpose_4x4(a, b, c, d, RX0, RY0);
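
/*
 * Input/output whitening: encryption XORs the input words with w[0..3]
 * and the output words with w[4..7]; decryption uses them in the
 * opposite order.  The outunpack_enc/inpack_dec variants also perform
 * the final/initial swap of the (a, b) and (c, d) halves.
 */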
#define inpack_enc8(a, b, c, d) \
	vpbroadcastd 4*0(RW), RT0; \
	vpxor RT0, a, a; \
	\
	vpbroadcastd 4*1(RW), RT0; \
	vpxor RT0, b, b; \
	\
	vpbroadcastd 4*2(RW), RT0; \
	vpxor RT0, c, c; \
	\
	vpbroadcastd 4*3(RW), RT0; \
	vpxor RT0, d, d;

#define outunpack_enc8(a, b, c, d) \
	vpbroadcastd 4*4(RW), RX0; \
	vpbroadcastd 4*5(RW), RY0; \
	vpxor RX0, c, RX0; \
	vpxor RY0, d, RY0; \
	\
	vpbroadcastd 4*6(RW), RT0; \
	vpxor RT0, a, c; \
	vpbroadcastd 4*7(RW), RT0; \
	vpxor RT0, b, d; \
	\
	vmovdqa RX0, a; \
	vmovdqa RY0, b;

#define inpack_dec8(a, b, c, d) \
	vpbroadcastd 4*4(RW), RX0; \
	vpbroadcastd 4*5(RW), RY0; \
	vpxor RX0, a, RX0; \
	vpxor RY0, b, RY0; \
	\
	vpbroadcastd 4*6(RW), RT0; \
	vpxor RT0, c, a; \
	vpbroadcastd 4*7(RW), RT0; \
	vpxor RT0, d, b; \
	\
	vmovdqa RX0, c; \
	vmovdqa RY0, d;

#define outunpack_dec8(a, b, c, d) \
	vpbroadcastd 4*0(RW), RT0; \
	vpxor RT0, a, a; \
	\
	vpbroadcastd 4*1(RW), RT0; \
	vpxor RT0, b, b; \
	\
	vpbroadcastd 4*2(RW), RT0; \
	vpxor RT0, c, c; \
	\
	vpbroadcastd 4*3(RW), RT0; \
	vpxor RT0, d, d;

#define read_blocks16(a, b, c, d) \
	read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
	read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);

#define write_blocks16(a, b, c, d) \
	write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
	write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);

#define xor_blocks16(a, b, c, d) \
	xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
	xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);

#define inpack_enc16(a, b, c, d) \
	inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
	inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);

#define outunpack_enc16(a, b, c, d) \
	outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
	outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);

#define inpack_dec16(a, b, c, d) \
	inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
	inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);

#define outunpack_dec16(a, b, c, d) \
	outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
	outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
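
/*
 * The 16-way functions keep 16 blocks in flight as two banks of eight:
 * bank 0 in RA0/RB0/RC0/RD0 and bank 1 in RA1/RB1/RC1/RD1, each
 * register holding one 32-bit Twofish word of eight blocks after
 * read_blocks16.
 */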
.align 8
__twofish_enc_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
	 * output:
	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
	 */
	init_round_constants();

	read_blocks16(RA, RB, RC, RD);
	inpack_enc16(RA, RB, RC, RD);
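
	/* rounds 0-1 in the first cycle, 2-13 in the loop, 14-15 last */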
	xorl RROUNDd, RROUNDd;
	encrypt_cycle_first16();
	movl $2, RROUNDd;

.align 4
.L__enc_loop:
	encrypt_cycle16();

	addl $2, RROUNDd;
	cmpl $14, RROUNDd;
	jne .L__enc_loop;

	encrypt_cycle_last16();

	outunpack_enc16(RA, RB, RC, RD);
	write_blocks16(RA, RB, RC, RD);

	ret;
ENDPROC(__twofish_enc_blk16)

.align 8
__twofish_dec_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
	 * output:
	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
	 */
	init_round_constants();

	read_blocks16(RA, RB, RC, RD);
	inpack_dec16(RA, RB, RC, RD);
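
	/* subkeys walk downwards: rounds 15-14 first, 13-2 in the loop,
	 * 1-0 last */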
	movl $14, RROUNDd;
	decrypt_cycle_first16();
	movl $12, RROUNDd;

.align 4
.L__dec_loop:
	decrypt_cycle16();

	addl $-2, RROUNDd;
	jnz .L__dec_loop;

	decrypt_cycle_last16();

	outunpack_dec16(RA, RB, RC, RD);
	write_blocks16(RA, RB, RC, RD);

	ret;
ENDPROC(__twofish_dec_blk16)
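
/*
 * The entry points below are called from C glue code.  Assumed C-side
 * declarations (a sketch; the authoritative prototypes live in the glue
 * source, e.g. twofish_avx2_glue.c):
 *
 *	asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx,
 *					      u8 *dst, const u8 *src);
 *	asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx,
 *					      u8 *dst, const u8 *src);
 *	asmlinkage void twofish_cbc_dec_16way(struct twofish_ctx *ctx,
 *					      u8 *dst, const u8 *src);
 *	asmlinkage void twofish_ctr_16way(struct twofish_ctx *ctx, u8 *dst,
 *					  const u8 *src, le128 *iv);
 *	asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx,
 *					      u8 *dst, const u8 *src,
 *					      le128 *iv);
 *	asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx,
 *					      u8 *dst, const u8 *src,
 *					      le128 *iv);
 *
 * %r12 (RROUND) is callee-saved in the SysV ABI, hence the push/pop
 * around each body.
 */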

ENTRY(twofish_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	vzeroupper;
	pushq %r12;

	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);

	call __twofish_enc_blk16;

	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);

	popq %r12;
	vzeroupper;

	ret;
ENDPROC(twofish_ecb_enc_16way)

ENTRY(twofish_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	vzeroupper;
	pushq %r12;

	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);

	call __twofish_dec_blk16;

	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);

	popq %r12;
	vzeroupper;

	ret;
ENDPROC(twofish_ecb_dec_16way)
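
/*
 * CBC decryption is parallel: all 16 blocks are decrypted first, then
 * store_cbc_16way (from glue_helper-asm-avx2.S) XORs each result with
 * the preceding ciphertext block still available in %rdx.
 */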
ENTRY(twofish_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	vzeroupper;
	pushq %r12;

	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);

	call __twofish_dec_blk16;

	store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1,
			RX0);

	popq %r12;
	vzeroupper;

	ret;
ENDPROC(twofish_cbc_dec_16way)
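
/*
 * CTR mode: load_ctr_16way expands the IV into 16 successive counter
 * blocks (using .Lbswap128_mask for the endianness flip), the counters
 * are encrypted, and store_ctr_16way XORs the resulting keystream with
 * the source data; the helper details live in glue_helper-asm-avx2.S.
 */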
ENTRY(twofish_ctr_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */
	vzeroupper;
	pushq %r12;

	load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1,
		       RC1, RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x,
		       RNOT, RBYTE);

	call __twofish_enc_blk16;

	store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);

	popq %r12;
	vzeroupper;

	ret;
ENDPROC(twofish_ctr_16way)
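
/*
 * Common XTS body: load_xts_16way derives the 16 per-block tweaks from
 * the IV (repeated GF(2¹²⁸) multiplication by α via the shl1 masks
 * above) and XORs them into the loaded blocks; %r8 selects the
 * encryption or decryption core, and store_xts_16way XORs the tweaks
 * back in on output.
 */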
.align 8
twofish_xts_crypt_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 *	%r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16
	 */
	vzeroupper;
	pushq %r12;

	load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
		       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
		       .Lxts_gf128mul_and_shl1_mask_0,
		       .Lxts_gf128mul_and_shl1_mask_1);

	call *%r8;

	store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);

	popq %r12;
	vzeroupper;

	ret;
ENDPROC(twofish_xts_crypt_16way)

ENTRY(twofish_xts_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	leaq __twofish_enc_blk16, %r8;
	jmp twofish_xts_crypt_16way;
ENDPROC(twofish_xts_enc_16way)

ENTRY(twofish_xts_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	leaq __twofish_dec_blk16, %r8;
	jmp twofish_xts_crypt_16way;
ENDPROC(twofish_xts_dec_16way)