/* twofish-avx-x86_64-asm_64.S */
/*
 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 */
  25. .file "twofish-avx-x86_64-asm_64.S"
  26. .text
  27. /* structure of crypto context */
  28. #define s0 0
  29. #define s1 1024
  30. #define s2 2048
  31. #define s3 3072
  32. #define w 4096
  33. #define k 4128
  34. /**********************************************************************
  35. 8-way AVX twofish
  36. **********************************************************************/
  37. #define CTX %rdi
  38. #define RA1 %xmm0
  39. #define RB1 %xmm1
  40. #define RC1 %xmm2
  41. #define RD1 %xmm3
  42. #define RA2 %xmm4
  43. #define RB2 %xmm5
  44. #define RC2 %xmm6
  45. #define RD2 %xmm7
  46. #define RX0 %xmm8
  47. #define RY0 %xmm9
  48. #define RX1 %xmm10
  49. #define RY1 %xmm11
  50. #define RK1 %xmm12
  51. #define RK2 %xmm13
  52. #define RT %xmm14
  53. #define RR %xmm15
  54. #define RID1 %rbp
  55. #define RID1d %ebp
  56. #define RID2 %rsi
  57. #define RID2d %esi
  58. #define RGI1 %rdx
  59. #define RGI1bl %dl
  60. #define RGI1bh %dh
  61. #define RGI2 %rcx
  62. #define RGI2bl %cl
  63. #define RGI2bh %ch
  64. #define RGI3 %rax
  65. #define RGI3bl %al
  66. #define RGI3bh %ah
  67. #define RGI4 %rbx
  68. #define RGI4bl %bl
  69. #define RGI4bh %bh
  70. #define RGS1 %r8
  71. #define RGS1d %r8d
  72. #define RGS2 %r9
  73. #define RGS2d %r9d
  74. #define RGS3 %r10
  75. #define RGS3d %r10d
  76. #define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
  77. movzbl src ## bl, RID1d; \
  78. movzbl src ## bh, RID2d; \
  79. shrq $16, src; \
  80. movl t0(CTX, RID1, 4), dst ## d; \
  81. movl t1(CTX, RID2, 4), RID2d; \
  82. movzbl src ## bl, RID1d; \
  83. xorl RID2d, dst ## d; \
  84. movzbl src ## bh, RID2d; \
  85. interleave_op(il_reg); \
  86. xorl t2(CTX, RID1, 4), dst ## d; \
  87. xorl t3(CTX, RID2, 4), dst ## d;
  88. #define dummy(d) /* do nothing */
  89. #define shr_next(reg) \
  90. shrq $16, reg;
  91. #define G(gi1, gi2, x, t0, t1, t2, t3) \
  92. lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1); \
  93. lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2); \
  94. \
  95. lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none); \
  96. shlq $32, RGS2; \
  97. orq RGS1, RGS2; \
  98. lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none); \
  99. shlq $32, RGS1; \
  100. orq RGS1, RGS3;
  101. #define round_head_2(a, b, x1, y1, x2, y2) \
  102. vmovq b ## 1, RGI3; \
  103. vpextrq $1, b ## 1, RGI4; \
  104. \
  105. G(RGI1, RGI2, x1, s0, s1, s2, s3); \
  106. vmovq a ## 2, RGI1; \
  107. vpextrq $1, a ## 2, RGI2; \
  108. vmovq RGS2, x1; \
  109. vpinsrq $1, RGS3, x1, x1; \
  110. \
  111. G(RGI3, RGI4, y1, s1, s2, s3, s0); \
  112. vmovq b ## 2, RGI3; \
  113. vpextrq $1, b ## 2, RGI4; \
  114. vmovq RGS2, y1; \
  115. vpinsrq $1, RGS3, y1, y1; \
  116. \
  117. G(RGI1, RGI2, x2, s0, s1, s2, s3); \
  118. vmovq RGS2, x2; \
  119. vpinsrq $1, RGS3, x2, x2; \
  120. \
  121. G(RGI3, RGI4, y2, s1, s2, s3, s0); \
  122. vmovq RGS2, y2; \
  123. vpinsrq $1, RGS3, y2, y2;
  124. #define encround_tail(a, b, c, d, x, y, prerotate) \
  125. vpaddd x, y, x; \
  126. vpaddd x, RK1, RT;\
  127. prerotate(b); \
  128. vpxor RT, c, c; \
  129. vpaddd y, x, y; \
  130. vpaddd y, RK2, y; \
  131. vpsrld $1, c, RT; \
  132. vpslld $(32 - 1), c, c; \
  133. vpor c, RT, c; \
  134. vpxor d, y, d; \
  135. #define decround_tail(a, b, c, d, x, y, prerotate) \
  136. vpaddd x, y, x; \
  137. vpaddd x, RK1, RT;\
  138. prerotate(a); \
  139. vpxor RT, c, c; \
  140. vpaddd y, x, y; \
  141. vpaddd y, RK2, y; \
  142. vpxor d, y, d; \
  143. vpsrld $1, d, y; \
  144. vpslld $(32 - 1), d, d; \
  145. vpor d, y, d; \
  146. #define rotate_1l(x) \
  147. vpslld $1, x, RR; \
  148. vpsrld $(32 - 1), x, x; \
  149. vpor x, RR, x;
  150. #define preload_rgi(c) \
  151. vmovq c, RGI1; \
  152. vpextrq $1, c, RGI2;
  153. #define encrypt_round(n, a, b, c, d, preload, prerotate) \
  154. vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
  155. vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
  156. round_head_2(a, b, RX0, RY0, RX1, RY1); \
  157. encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
  158. preload(c ## 1); \
  159. encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
  160. #define decrypt_round(n, a, b, c, d, preload, prerotate) \
  161. vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
  162. vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
  163. round_head_2(a, b, RX0, RY0, RX1, RY1); \
  164. decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
  165. preload(c ## 1); \
  166. decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
  167. #define encrypt_cycle(n) \
  168. encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
  169. encrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l);
  170. #define encrypt_cycle_last(n) \
  171. encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
  172. encrypt_round(((2*n) + 1), RC, RD, RA, RB, dummy, dummy);
  173. #define decrypt_cycle(n) \
  174. decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
  175. decrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l);
  176. #define decrypt_cycle_last(n) \
  177. decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
  178. decrypt_round((2*n), RA, RB, RC, RD, dummy, dummy);
  179. #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
  180. vpunpckldq x1, x0, t0; \
  181. vpunpckhdq x1, x0, t2; \
  182. vpunpckldq x3, x2, t1; \
  183. vpunpckhdq x3, x2, x3; \
  184. \
  185. vpunpcklqdq t1, t0, x0; \
  186. vpunpckhqdq t1, t0, x1; \
  187. vpunpcklqdq x3, t2, x2; \
  188. vpunpckhqdq x3, t2, x3;
  189. #define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
  190. vpxor (0*4*4)(in), wkey, x0; \
  191. vpxor (1*4*4)(in), wkey, x1; \
  192. vpxor (2*4*4)(in), wkey, x2; \
  193. vpxor (3*4*4)(in), wkey, x3; \
  194. \
  195. transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
  196. #define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
  197. transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
  198. \
  199. vpxor x0, wkey, x0; \
  200. vmovdqu x0, (0*4*4)(out); \
  201. vpxor x1, wkey, x1; \
  202. vmovdqu x1, (1*4*4)(out); \
  203. vpxor x2, wkey, x2; \
  204. vmovdqu x2, (2*4*4)(out); \
  205. vpxor x3, wkey, x3; \
  206. vmovdqu x3, (3*4*4)(out);
  207. #define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
  208. transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
  209. \
  210. vpxor x0, wkey, x0; \
  211. vpxor (0*4*4)(out), x0, x0; \
  212. vmovdqu x0, (0*4*4)(out); \
  213. vpxor x1, wkey, x1; \
  214. vpxor (1*4*4)(out), x1, x1; \
  215. vmovdqu x1, (1*4*4)(out); \
  216. vpxor x2, wkey, x2; \
  217. vpxor (2*4*4)(out), x2, x2; \
  218. vmovdqu x2, (2*4*4)(out); \
  219. vpxor x3, wkey, x3; \
  220. vpxor (3*4*4)(out), x3, x3; \
  221. vmovdqu x3, (3*4*4)(out);
  222. .align 8
  223. .global __twofish_enc_blk_8way
  224. .type __twofish_enc_blk_8way,@function;
  225. __twofish_enc_blk_8way:
  226. /* input:
  227. * %rdi: ctx, CTX
  228. * %rsi: dst
  229. * %rdx: src
  230. * %rcx: bool, if true: xor output
  231. */
  232. pushq %rbp;
  233. pushq %rbx;
  234. pushq %rcx;
  235. vmovdqu w(CTX), RK1;
  236. leaq (4*4*4)(%rdx), %rax;
  237. inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
  238. preload_rgi(RA1);
  239. rotate_1l(RD1);
  240. inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
  241. rotate_1l(RD2);
  242. movq %rsi, %r11;
  243. encrypt_cycle(0);
  244. encrypt_cycle(1);
  245. encrypt_cycle(2);
  246. encrypt_cycle(3);
  247. encrypt_cycle(4);
  248. encrypt_cycle(5);
  249. encrypt_cycle(6);
  250. encrypt_cycle_last(7);
  251. vmovdqu (w+4*4)(CTX), RK1;
  252. popq %rcx;
  253. popq %rbx;
  254. popq %rbp;
  255. leaq (4*4*4)(%r11), %rax;
  256. testb %cl, %cl;
  257. jnz __enc_xor8;
  258. outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
  259. outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
  260. ret;
  261. __enc_xor8:
  262. outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
  263. outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
  264. ret;
  265. .align 8
  266. .global twofish_dec_blk_8way
  267. .type twofish_dec_blk_8way,@function;
  268. twofish_dec_blk_8way:
  269. /* input:
  270. * %rdi: ctx, CTX
  271. * %rsi: dst
  272. * %rdx: src
  273. */
  274. pushq %rbp;
  275. pushq %rbx;
  276. vmovdqu (w+4*4)(CTX), RK1;
  277. leaq (4*4*4)(%rdx), %rax;
  278. inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
  279. preload_rgi(RC1);
  280. rotate_1l(RA1);
  281. inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
  282. rotate_1l(RA2);
  283. movq %rsi, %r11;
  284. decrypt_cycle(7);
  285. decrypt_cycle(6);
  286. decrypt_cycle(5);
  287. decrypt_cycle(4);
  288. decrypt_cycle(3);
  289. decrypt_cycle(2);
  290. decrypt_cycle(1);
  291. decrypt_cycle_last(0);
  292. vmovdqu (w)(CTX), RK1;
  293. popq %rbx;
  294. popq %rbp;
  295. leaq (4*4*4)(%r11), %rax;
  296. outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
  297. outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
  298. ret;