/*
 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */
  23. .file "twofish-avx-x86_64-asm_64.S"
  24. .text
  25. /* structure of crypto context */
  26. #define s0 0
  27. #define s1 1024
  28. #define s2 2048
  29. #define s3 3072
  30. #define w 4096
  31. #define k 4128
  32. /**********************************************************************
  33. 8-way AVX twofish
  34. **********************************************************************/
  35. #define CTX %rdi
  36. #define RA1 %xmm0
  37. #define RB1 %xmm1
  38. #define RC1 %xmm2
  39. #define RD1 %xmm3
  40. #define RA2 %xmm4
  41. #define RB2 %xmm5
  42. #define RC2 %xmm6
  43. #define RD2 %xmm7
  44. #define RX %xmm8
  45. #define RY %xmm9
  46. #define RK1 %xmm10
  47. #define RK2 %xmm11
  48. #define RID1 %rax
  49. #define RID1b %al
  50. #define RID2 %rbx
  51. #define RID2b %bl
  52. #define RGI1 %rdx
  53. #define RGI1bl %dl
  54. #define RGI1bh %dh
  55. #define RGI2 %rcx
  56. #define RGI2bl %cl
  57. #define RGI2bh %ch
  58. #define RGS1 %r8
  59. #define RGS1d %r8d
  60. #define RGS2 %r9
  61. #define RGS2d %r9d
  62. #define RGS3 %r10
  63. #define RGS3d %r10d
  64. #define lookup_32bit(t0, t1, t2, t3, src, dst) \
  65. movb src ## bl, RID1b; \
  66. movb src ## bh, RID2b; \
  67. movl t0(CTX, RID1, 4), dst ## d; \
  68. xorl t1(CTX, RID2, 4), dst ## d; \
  69. shrq $16, src; \
  70. movb src ## bl, RID1b; \
  71. movb src ## bh, RID2b; \
  72. xorl t2(CTX, RID1, 4), dst ## d; \
  73. xorl t3(CTX, RID2, 4), dst ## d;
  74. #define G(a, x, t0, t1, t2, t3) \
  75. vmovq a, RGI1; \
  76. vpsrldq $8, a, x; \
  77. vmovq x, RGI2; \
  78. \
  79. lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
  80. shrq $16, RGI1; \
  81. lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
  82. shlq $32, RGS2; \
  83. orq RGS1, RGS2; \
  84. \
  85. lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
  86. shrq $16, RGI2; \
  87. lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
  88. shlq $32, RGS3; \
  89. orq RGS1, RGS3; \
  90. \
  91. vmovq RGS2, x; \
  92. vpinsrq $1, RGS3, x, x;
  93. #define encround(a, b, c, d, x, y) \
  94. G(a, x, s0, s1, s2, s3); \
  95. G(b, y, s1, s2, s3, s0); \
  96. vpaddd x, y, x; \
  97. vpaddd y, x, y; \
  98. vpaddd x, RK1, x; \
  99. vpaddd y, RK2, y; \
  100. vpxor x, c, c; \
  101. vpsrld $1, c, x; \
  102. vpslld $(32 - 1), c, c; \
  103. vpor c, x, c; \
  104. vpslld $1, d, x; \
  105. vpsrld $(32 - 1), d, d; \
  106. vpor d, x, d; \
  107. vpxor d, y, d;
  108. #define decround(a, b, c, d, x, y) \
  109. G(a, x, s0, s1, s2, s3); \
  110. G(b, y, s1, s2, s3, s0); \
  111. vpaddd x, y, x; \
  112. vpaddd y, x, y; \
  113. vpaddd y, RK2, y; \
  114. vpxor d, y, d; \
  115. vpsrld $1, d, y; \
  116. vpslld $(32 - 1), d, d; \
  117. vpor d, y, d; \
  118. vpslld $1, c, y; \
  119. vpsrld $(32 - 1), c, c; \
  120. vpor c, y, c; \
  121. vpaddd x, RK1, x; \
  122. vpxor x, c, c;
  123. #define encrypt_round(n, a, b, c, d) \
  124. vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
  125. vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
  126. encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
  127. encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
  128. #define decrypt_round(n, a, b, c, d) \
  129. vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
  130. vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
  131. decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
  132. decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
  133. #define encrypt_cycle(n) \
  134. encrypt_round((2*n), RA, RB, RC, RD); \
  135. encrypt_round(((2*n) + 1), RC, RD, RA, RB);
  136. #define decrypt_cycle(n) \
  137. decrypt_round(((2*n) + 1), RC, RD, RA, RB); \
  138. decrypt_round((2*n), RA, RB, RC, RD);
  139. #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
  140. vpunpckldq x1, x0, t0; \
  141. vpunpckhdq x1, x0, t2; \
  142. vpunpckldq x3, x2, t1; \
  143. vpunpckhdq x3, x2, x3; \
  144. \
  145. vpunpcklqdq t1, t0, x0; \
  146. vpunpckhqdq t1, t0, x1; \
  147. vpunpcklqdq x3, t2, x2; \
  148. vpunpckhqdq x3, t2, x3;
  149. #define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
  150. vpxor (0*4*4)(in), wkey, x0; \
  151. vpxor (1*4*4)(in), wkey, x1; \
  152. vpxor (2*4*4)(in), wkey, x2; \
  153. vpxor (3*4*4)(in), wkey, x3; \
  154. \
  155. transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
  156. #define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
  157. transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
  158. \
  159. vpxor x0, wkey, x0; \
  160. vmovdqu x0, (0*4*4)(out); \
  161. vpxor x1, wkey, x1; \
  162. vmovdqu x1, (1*4*4)(out); \
  163. vpxor x2, wkey, x2; \
  164. vmovdqu x2, (2*4*4)(out); \
  165. vpxor x3, wkey, x3; \
  166. vmovdqu x3, (3*4*4)(out);
  167. #define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
  168. transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
  169. \
  170. vpxor x0, wkey, x0; \
  171. vpxor (0*4*4)(out), x0, x0; \
  172. vmovdqu x0, (0*4*4)(out); \
  173. vpxor x1, wkey, x1; \
  174. vpxor (1*4*4)(out), x1, x1; \
  175. vmovdqu x1, (1*4*4)(out); \
  176. vpxor x2, wkey, x2; \
  177. vpxor (2*4*4)(out), x2, x2; \
  178. vmovdqu x2, (2*4*4)(out); \
  179. vpxor x3, wkey, x3; \
  180. vpxor (3*4*4)(out), x3, x3; \
  181. vmovdqu x3, (3*4*4)(out);
  182. .align 8
  183. .global __twofish_enc_blk_8way
  184. .type __twofish_enc_blk_8way,@function;
  185. __twofish_enc_blk_8way:
  186. /* input:
  187. * %rdi: ctx, CTX
  188. * %rsi: dst
  189. * %rdx: src
  190. * %rcx: bool, if true: xor output
  191. */
  192. pushq %rbx;
  193. pushq %rcx;
  194. vmovdqu w(CTX), RK1;
  195. leaq (4*4*4)(%rdx), %rax;
  196. inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
  197. inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
  198. xorq RID1, RID1;
  199. xorq RID2, RID2;
  200. encrypt_cycle(0);
  201. encrypt_cycle(1);
  202. encrypt_cycle(2);
  203. encrypt_cycle(3);
  204. encrypt_cycle(4);
  205. encrypt_cycle(5);
  206. encrypt_cycle(6);
  207. encrypt_cycle(7);
  208. vmovdqu (w+4*4)(CTX), RK1;
  209. popq %rcx;
  210. popq %rbx;
  211. leaq (4*4*4)(%rsi), %rax;
  212. testb %cl, %cl;
  213. jnz __enc_xor8;
  214. outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
  215. outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
  216. ret;
  217. __enc_xor8:
  218. outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
  219. outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
  220. ret;
  221. .align 8
  222. .global twofish_dec_blk_8way
  223. .type twofish_dec_blk_8way,@function;
  224. twofish_dec_blk_8way:
  225. /* input:
  226. * %rdi: ctx, CTX
  227. * %rsi: dst
  228. * %rdx: src
  229. */
  230. pushq %rbx;
  231. vmovdqu (w+4*4)(CTX), RK1;
  232. leaq (4*4*4)(%rdx), %rax;
  233. inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
  234. inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
  235. xorq RID1, RID1;
  236. xorq RID2, RID2;
  237. decrypt_cycle(7);
  238. decrypt_cycle(6);
  239. decrypt_cycle(5);
  240. decrypt_cycle(4);
  241. decrypt_cycle(3);
  242. decrypt_cycle(2);
  243. decrypt_cycle(1);
  244. decrypt_cycle(0);
  245. vmovdqu (w)(CTX), RK1;
  246. popq %rbx;
  247. leaq (4*4*4)(%rsi), %rax;
  248. outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
  249. outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
  250. ret;