/*
 * x86_64/AVX2 assembler optimized version of Blowfish
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */
#include <linux/linkage.h>

.file "blowfish-avx2-asm_64.S"

.data
.align 32

.Lprefetch_mask:
	.long 0*64
	.long 1*64
	.long 2*64
	.long 3*64
	.long 4*64
	.long 5*64
	.long 6*64
	.long 7*64

.Lbswap32_mask:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f

.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.Lbswap_iv_mask:
	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0

.text
/* structure of crypto context */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)
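
/*
 * These offsets are assumed to mirror the C-side key structure,
 * roughly:
 *
 *	struct bf_ctx {
 *		u32 p[16 + 2];	18 round subkeys
 *		u32 s[4][256];	four 256-entry S-boxes
 *	};
 *
 * so p is the P-array at offset 0 and s0..s3 are the byte offsets of
 * the four S-boxes, each 256 * 4 bytes long.
 */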
/* register macros */
#define CTX	%rdi
#define RIO	%rdx

#define RS0	%rax
#define RS1	%r8
#define RS2	%r9
#define RS3	%r10

#define RLOOP	%r11
#define RLOOPd	%r11d

#define RXr0	%ymm8
#define RXr1	%ymm9
#define RXr2	%ymm10
#define RXr3	%ymm11
#define RXl0	%ymm12
#define RXl1	%ymm13
#define RXl2	%ymm14
#define RXl3	%ymm15

/* temp regs */
#define RT0	%ymm0
#define RT0x	%xmm0
#define RT1	%ymm1
#define RT1x	%xmm1
#define RIDX0	%ymm2
#define RIDX1	%ymm3
#define RIDX1x	%xmm3
#define RIDX2	%ymm4
#define RIDX3	%ymm5

/* vpgatherdd mask and '-1' */
#define RNOT	%ymm6

/* byte mask, (-1 >> 24) */
#define RBYTE	%ymm7
/***********************************************************************
 * 32-way AVX2 blowfish
 ***********************************************************************/
#define F(xl, xr) \
	vpsrld $24, xl, RIDX0; \
	vpsrld $16, xl, RIDX1; \
	vpsrld $8, xl, RIDX2; \
	vpand RBYTE, RIDX1, RIDX1; \
	vpand RBYTE, RIDX2, RIDX2; \
	vpand RBYTE, xl, RIDX3; \
	\
	vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpcmpeqd RIDX0, RIDX0, RIDX0; \
	\
	vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \
	vpcmpeqd RIDX1, RIDX1, RIDX1; \
	vpaddd RT0, RT1, RT0; \
	\
	vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \
	vpxor RT0, RT1, RT0; \
	\
	vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \
	vpcmpeqd RNOT, RNOT, RNOT; \
	vpaddd RT0, RT1, RT0; \
	\
	vpxor RT0, xr, xr;
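
/*
 * This is the standard Blowfish round function, computed for eight
 * 32-bit words at once via vpgatherdd; a scalar sketch:
 *
 *	a = xl >> 24;          b = (xl >> 16) & 0xff;
 *	c = (xl >> 8) & 0xff;  d = xl & 0xff;
 *	xr ^= ((s0[a] + s1[b]) ^ s2[c]) + s3[d];
 *
 * The interleaved vpcmpeqd instructions rebuild the all-ones gather
 * masks, since vpgatherdd zeroes its mask register on completion.
 */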
#define add_roundkey(xl, nmem) \
	vpbroadcastd nmem, RT0; \
	vpxor RT0, xl ## 0, xl ## 0; \
	vpxor RT0, xl ## 1, xl ## 1; \
	vpxor RT0, xl ## 2, xl ## 2; \
	vpxor RT0, xl ## 3, xl ## 3;
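
/*
 * add_roundkey broadcasts one 32-bit P-array subkey from memory into
 * all dword lanes of RT0 and xors it into the four given registers,
 * i.e. into one half of all 32 blocks at once.
 */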
#define round_enc() \
	add_roundkey(RXr, p(CTX,RLOOP,4)); \
	F(RXl0, RXr0); \
	F(RXl1, RXr1); \
	F(RXl2, RXr2); \
	F(RXl3, RXr3); \
	\
	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
	F(RXr0, RXl0); \
	F(RXr1, RXl1); \
	F(RXr2, RXl2); \
	F(RXr3, RXl3);

#define round_dec() \
	add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \
	F(RXl0, RXr0); \
	F(RXl1, RXr1); \
	F(RXl2, RXr2); \
	F(RXl3, RXr3); \
	\
	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
	F(RXr0, RXl0); \
	F(RXr1, RXl1); \
	F(RXr2, RXl2); \
	F(RXr3, RXl3);
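
/*
 * round_enc()/round_dec() each perform two Feistel rounds without
 * the usual half swap: a subkey is xored into one half, F of that
 * half is xored into the other, then the roles flip.  round_enc()
 * consumes P[RLOOP] and P[RLOOP + 1] with RLOOP stepping 1, 3, ...,
 * 15; round_dec() consumes P[RLOOP + 2] and P[RLOOP + 1] with RLOOP
 * stepping 14, 12, ..., 0, i.e. the same subkeys in reverse order.
 */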
#define init_round_constants() \
	vpcmpeqd RNOT, RNOT, RNOT; \
	leaq s0(CTX), RS0; \
	leaq s1(CTX), RS1; \
	leaq s2(CTX), RS2; \
	leaq s3(CTX), RS3; \
	vpsrld $24, RNOT, RBYTE;
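
/*
 * Loads the four S-box base pointers and builds the two constants
 * used throughout: RNOT (all ones, the initial gather mask) and
 * RBYTE (0x000000ff in every dword, for masking byte indices).
 */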
#define transpose_2x2(x0, x1, t0) \
	vpunpckldq x0, x1, t0; \
	vpunpckhdq x0, x1, x1; \
	\
	vpunpcklqdq t0, x1, x0; \
	vpunpckhqdq t0, x1, x1;
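
/*
 * Viewing each 128-bit lane of x0/x1 as a 2x2 matrix of 32-bit
 * elements, this swaps rows and columns.  read_block/write_block use
 * it to separate the interleaved (left, right) halves of consecutive
 * blocks into one all-left and one all-right register; applying it
 * again restores the original layout.
 */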
#define read_block(xl, xr) \
	vbroadcasti128 .Lbswap32_mask, RT1; \
	\
	vpshufb RT1, xl ## 0, xl ## 0; \
	vpshufb RT1, xr ## 0, xr ## 0; \
	vpshufb RT1, xl ## 1, xl ## 1; \
	vpshufb RT1, xr ## 1, xr ## 1; \
	vpshufb RT1, xl ## 2, xl ## 2; \
	vpshufb RT1, xr ## 2, xr ## 2; \
	vpshufb RT1, xl ## 3, xl ## 3; \
	vpshufb RT1, xr ## 3, xr ## 3; \
	\
	transpose_2x2(xl ## 0, xr ## 0, RT0); \
	transpose_2x2(xl ## 1, xr ## 1, RT0); \
	transpose_2x2(xl ## 2, xr ## 2, RT0); \
	transpose_2x2(xl ## 3, xr ## 3, RT0);

#define write_block(xl, xr) \
	vbroadcasti128 .Lbswap32_mask, RT1; \
	\
	transpose_2x2(xl ## 0, xr ## 0, RT0); \
	transpose_2x2(xl ## 1, xr ## 1, RT0); \
	transpose_2x2(xl ## 2, xr ## 2, RT0); \
	transpose_2x2(xl ## 3, xr ## 3, RT0); \
	\
	vpshufb RT1, xl ## 0, xl ## 0; \
	vpshufb RT1, xr ## 0, xr ## 0; \
	vpshufb RT1, xl ## 1, xl ## 1; \
	vpshufb RT1, xr ## 1, xr ## 1; \
	vpshufb RT1, xl ## 2, xl ## 2; \
	vpshufb RT1, xr ## 2, xr ## 2; \
	vpshufb RT1, xl ## 3, xl ## 3; \
	vpshufb RT1, xr ## 3, xr ## 3;
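
/*
 * Blowfish treats each 64-bit block as two big-endian 32-bit halves.
 * read_block byteswaps every dword to little-endian host order (the
 * .Lbswap32_mask vpshufb control reverses bytes within each dword)
 * and then transposes, so each RXl register ends up holding only
 * left halves and each RXr register only right halves of eight
 * blocks; write_block applies the inverse transformation.
 */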
.align 8
__blowfish_enc_blk32:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RXl0..3, RXr0..3: plaintext
	 * output:
	 *	RXl0..3, RXr0..3: ciphertext (RXl <=> RXr swapped)
	 */
	init_round_constants();

	read_block(RXl, RXr);

	movl $1, RLOOPd;
	add_roundkey(RXl, p+4*(0)(CTX));

.align 4
.L__enc_loop:
	round_enc();

	leal 2(RLOOPd), RLOOPd;
	cmpl $17, RLOOPd;
	jne .L__enc_loop;

	add_roundkey(RXr, p+4*(17)(CTX));

	write_block(RXl, RXr);

	ret;
ENDPROC(__blowfish_enc_blk32)
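
/*
 * Equivalent scalar flow for one 64-bit block (xl, xr), matching the
 * loop above:
 *
 *	xl ^= P[0];
 *	for (i = 1; i < 17; i += 2) {
 *		xr ^= P[i];     xr ^= F(xl);
 *		xl ^= P[i + 1]; xl ^= F(xr);
 *	}
 *	xr ^= P[17];
 *
 * The halves come out swapped, which the callers undo by storing the
 * RXr registers before the RXl ones.
 */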
.align 8
__blowfish_dec_blk32:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RXl0..3, RXr0..3: ciphertext
	 * output:
	 *	RXl0..3, RXr0..3: plaintext (RXl <=> RXr swapped)
	 */
	init_round_constants();

	read_block(RXl, RXr);

	movl $14, RLOOPd;
	add_roundkey(RXl, p+4*(17)(CTX));

.align 4
.L__dec_loop:
	round_dec();

	addl $-2, RLOOPd;
	jns .L__dec_loop;

	add_roundkey(RXr, p+4*(0)(CTX));

	write_block(RXl, RXr);

	ret;
ENDPROC(__blowfish_dec_blk32)
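
/*
 * Decryption is the same network with the subkeys in reverse; scalar
 * sketch per block:
 *
 *	xl ^= P[17];
 *	for (i = 16; i > 0; i -= 2) {
 *		xr ^= P[i];     xr ^= F(xl);
 *		xl ^= P[i - 1]; xl ^= F(xr);
 *	}
 *	xr ^= P[0];
 */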
ENTRY(blowfish_ecb_enc_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	vzeroupper;

	vmovdqu 0*32(%rdx), RXl0;
	vmovdqu 1*32(%rdx), RXr0;
	vmovdqu 2*32(%rdx), RXl1;
	vmovdqu 3*32(%rdx), RXr1;
	vmovdqu 4*32(%rdx), RXl2;
	vmovdqu 5*32(%rdx), RXr2;
	vmovdqu 6*32(%rdx), RXl3;
	vmovdqu 7*32(%rdx), RXr3;

	call __blowfish_enc_blk32;

	vmovdqu RXr0, 0*32(%rsi);
	vmovdqu RXl0, 1*32(%rsi);
	vmovdqu RXr1, 2*32(%rsi);
	vmovdqu RXl1, 3*32(%rsi);
	vmovdqu RXr2, 4*32(%rsi);
	vmovdqu RXl2, 5*32(%rsi);
	vmovdqu RXr3, 6*32(%rsi);
	vmovdqu RXl3, 7*32(%rsi);

	vzeroupper;
	ret;
ENDPROC(blowfish_ecb_enc_32way)
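
/*
 * ECB handles 32 independent 8-byte blocks (256 bytes) per call:
 * eight unaligned 32-byte loads, one pass through the cipher core,
 * then eight stores with the RXr/RXl roles exchanged to undo the
 * final half swap.  blowfish_ecb_dec_32way below is identical except
 * that it calls the decryption core.
 */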
ENTRY(blowfish_ecb_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	vzeroupper;

	vmovdqu 0*32(%rdx), RXl0;
	vmovdqu 1*32(%rdx), RXr0;
	vmovdqu 2*32(%rdx), RXl1;
	vmovdqu 3*32(%rdx), RXr1;
	vmovdqu 4*32(%rdx), RXl2;
	vmovdqu 5*32(%rdx), RXr2;
	vmovdqu 6*32(%rdx), RXl3;
	vmovdqu 7*32(%rdx), RXr3;

	call __blowfish_dec_blk32;

	vmovdqu RXr0, 0*32(%rsi);
	vmovdqu RXl0, 1*32(%rsi);
	vmovdqu RXr1, 2*32(%rsi);
	vmovdqu RXl1, 3*32(%rsi);
	vmovdqu RXr2, 4*32(%rsi);
	vmovdqu RXl2, 5*32(%rsi);
	vmovdqu RXr3, 6*32(%rsi);
	vmovdqu RXl3, 7*32(%rsi);

	vzeroupper;
	ret;
ENDPROC(blowfish_ecb_dec_32way)
ENTRY(blowfish_cbc_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	vzeroupper;

	vmovdqu 0*32(%rdx), RXl0;
	vmovdqu 1*32(%rdx), RXr0;
	vmovdqu 2*32(%rdx), RXl1;
	vmovdqu 3*32(%rdx), RXr1;
	vmovdqu 4*32(%rdx), RXl2;
	vmovdqu 5*32(%rdx), RXr2;
	vmovdqu 6*32(%rdx), RXl3;
	vmovdqu 7*32(%rdx), RXr3;

	call __blowfish_dec_blk32;

	/* xor with src */
	vmovq (%rdx), RT0x;
	vpshufd $0x4f, RT0x, RT0x;
	vinserti128 $1, 8(%rdx), RT0, RT0;
	vpxor RT0, RXr0, RXr0;
	vpxor 0*32+24(%rdx), RXl0, RXl0;
	vpxor 1*32+24(%rdx), RXr1, RXr1;
	vpxor 2*32+24(%rdx), RXl1, RXl1;
	vpxor 3*32+24(%rdx), RXr2, RXr2;
	vpxor 4*32+24(%rdx), RXl2, RXl2;
	vpxor 5*32+24(%rdx), RXr3, RXr3;
	vpxor 6*32+24(%rdx), RXl3, RXl3;

	vmovdqu RXr0, (0*32)(%rsi);
	vmovdqu RXl0, (1*32)(%rsi);
	vmovdqu RXr1, (2*32)(%rsi);
	vmovdqu RXl1, (3*32)(%rsi);
	vmovdqu RXr2, (4*32)(%rsi);
	vmovdqu RXl2, (5*32)(%rsi);
	vmovdqu RXr3, (6*32)(%rsi);
	vmovdqu RXl3, (7*32)(%rsi);

	vzeroupper;
	ret;
ENDPROC(blowfish_cbc_dec_32way)
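
/*
 * CBC decryption chains as dst[i] = decrypt(src[i]) ^ src[i - 1]; a
 * scalar sketch of the xor pass above:
 *
 *	dst[0] = decrypt(src[0]);	(IV xor left to the caller)
 *	for (i = 1; i < 32; i++)
 *		dst[i] = decrypt(src[i]) ^ src[i - 1];
 *
 * The vmovq/vpshufd/vinserti128 sequence builds the {0, C0, C1, C2}
 * vector for the first four output blocks, so block 0 is xored with
 * zero here and its IV xor is presumably handled by the C glue code.
 * The remaining xors read the previous ciphertext blocks unaligned
 * at n*32 + 24.
 */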
ENTRY(blowfish_ctr_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (big endian, 64bit)
	 */
	vzeroupper;

	vpcmpeqd RT0, RT0, RT0;
	vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */

	vpcmpeqd RT1x, RT1x, RT1x;
	vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */
	vpxor RIDX0, RIDX0, RIDX0;
	vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */

	vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */

	vpcmpeqd RT1, RT1, RT1;
	vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */
	vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */

	vbroadcasti128 .Lbswap_iv_mask, RIDX0;
	vbroadcasti128 .Lbswap128_mask, RIDX1;

	/* load IV and byteswap */
	vmovq (%rcx), RT1x;
	vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */
	vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */

	/* construct IVs */
	vpsubq RT0, RT1, RT1; /* a: le1, b: le0, c: le3, d: le2 */
	vpshufb RIDX1, RT1, RXl0; /* a: be0, b: be1, c: be2, d: be3 */
	vpsubq RIDX2, RT1, RT1; /* le5, le4, le7, le6 */
	vpshufb RIDX1, RT1, RXr0; /* be4, be5, be6, be7 */
	vpsubq RIDX2, RT1, RT1;
	vpshufb RIDX1, RT1, RXl1;
	vpsubq RIDX2, RT1, RT1;
	vpshufb RIDX1, RT1, RXr1;
	vpsubq RIDX2, RT1, RT1;
	vpshufb RIDX1, RT1, RXl2;
	vpsubq RIDX2, RT1, RT1;
	vpshufb RIDX1, RT1, RXr2;
	vpsubq RIDX2, RT1, RT1;
	vpshufb RIDX1, RT1, RXl3;
	vpsubq RIDX2, RT1, RT1;
	vpshufb RIDX1, RT1, RXr3;

	/* store last IV */
	vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */
	vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */
	vmovq RT1x, (%rcx);

	call __blowfish_enc_blk32;

	/* dst = src ^ iv */
	vpxor 0*32(%rdx), RXr0, RXr0;
	vpxor 1*32(%rdx), RXl0, RXl0;
	vpxor 2*32(%rdx), RXr1, RXr1;
	vpxor 3*32(%rdx), RXl1, RXl1;
	vpxor 4*32(%rdx), RXr2, RXr2;
	vpxor 5*32(%rdx), RXl2, RXl2;
	vpxor 6*32(%rdx), RXr3, RXr3;
	vpxor 7*32(%rdx), RXl3, RXl3;

	vmovdqu RXr0, (0*32)(%rsi);
	vmovdqu RXl0, (1*32)(%rsi);
	vmovdqu RXr1, (2*32)(%rsi);
	vmovdqu RXl1, (3*32)(%rsi);
	vmovdqu RXr2, (4*32)(%rsi);
	vmovdqu RXl2, (5*32)(%rsi);
	vmovdqu RXr3, (6*32)(%rsi);
	vmovdqu RXl3, (7*32)(%rsi);

	vzeroupper;
	ret;
ENDPROC(blowfish_ctr_32way)
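
/*
 * CTR summary: the 64-bit big-endian counter at (%rcx) is byteswapped
 * to little-endian, replicated, and stepped with vpsubq against
 * negative constants (subtracting -1 adds 1); the skewed offsets in
 * RT0 lay the counters out in the qword order that the 16-byte
 * reversal by .Lbswap128_mask turns back into big-endian be0..be31.
 * The 32 counter blocks are then encrypted and xored into the data,
 *
 *	dst[i] = src[i] ^ encrypt(ctr + i)	for i = 0..31,
 *
 * and ctr + 32 is stored back to (%rcx) for the next call.
 */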