/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 */
.file "cast6-avx-x86_64-asm_64.S"

.text

.extern cast6_s1
.extern cast6_s2
.extern cast6_s3
.extern cast6_s4

/* structure of crypto context */
#define km	0
#define kr	(12*4*4)
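/*
 * Note: km/kr assume the kernel's generic CAST6 context layout, i.e. the
 * 32-bit masking subkeys Km[12][4] at offset 0, followed by the 5-bit
 * rotation subkeys Kr[12][4], one byte each, at offset 12*4*4 = 192.
 */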
/* s-boxes */
#define s1	cast6_s1
#define s2	cast6_s2
#define s3	cast6_s3
#define s4	cast6_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
#define CTX %rdi

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX %xmm8

#define RKM  %xmm9
#define RKRF %xmm10
#define RKRR %xmm11

#define RTMP  %xmm12
#define RMASK %xmm13
#define R32   %xmm14

#define RID1  %rax
#define RID1b %al
#define RID2  %rbx
#define RID2b %bl

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d
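/*
 * After inpack_blocks, RA1/RB1/RC1/RD1 hold the A/B/C/D words of the first
 * four blocks and RA2..RD2 those of the second four, so eight blocks are
 * processed in parallel.  RID2 maps to the callee-saved %rbx, which is why
 * both entry points push/pop it around the rounds.
 */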
#define lookup_32bit(src, dst, op1, op2, op3) \
	movb	src ## bl,     RID1b;    \
	movb	src ## bh,     RID2b;    \
	movl	s1(, RID1, 4), dst ## d; \
	op1	s2(, RID2, 4), dst ## d; \
	shrq $16, src;                   \
	movb	src ## bl,     RID1b;    \
	movb	src ## bh,     RID2b;    \
	op2	s3(, RID1, 4), dst ## d; \
	op3	s4(, RID2, 4), dst ## d;
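/*
 * lookup_32bit consumes one 32-bit lane held in the GPR src: the low two
 * bytes index s1 and s2, then after shifting right by 16 the next two bytes
 * index s3 and s4.  The four dword table entries are combined into dst with
 * the add/sub/xor operations passed in as op1..op3.
 */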
#define F(a, x, op0, op1, op2, op3) \
	op0	a, RKM, x; \
	vpslld	RKRF, x, RTMP; \
	vpsrld	RKRR, x, x; \
	vpor	RTMP, x, x; \
	\
	vpshufb	RMASK, x, x; \
	vmovq	x, RGI1; \
	vpsrldq $8, x, x; \
	vmovq	x, RGI2; \
	\
	lookup_32bit(RGI1, RFS1, op1, op2, op3); \
	shrq $16, RGI1; \
	lookup_32bit(RGI1, RFS2, op1, op2, op3); \
	shlq $32, RFS2; \
	orq RFS1, RFS2; \
	\
	lookup_32bit(RGI2, RFS1, op1, op2, op3); \
	shrq $16, RGI2; \
	lookup_32bit(RGI2, RFS3, op1, op2, op3); \
	shlq $32, RFS3; \
	orq RFS1, RFS3; \
	\
	vmovq RFS2, x; \
	vpinsrq $1, RFS3, x, x;
#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl)
#define F2(b, x) F(b, x, vpxor,  subl, addl, xorl)
#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl)
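/*
 * F computes one CAST6 round function on all eight blocks: combine the data
 * word with Km (op0 = add/xor/sub), rotate left by Kr (left shift by Kr
 * OR'd with right shift by 32-Kr held in RKRR), byte-swap each dword via
 * RMASK, then run the s-box lookups one 32-bit lane at a time in the
 * general-purpose registers.  F1/F2/F3 select the operation patterns of the
 * f1/f2/f3 round functions defined in RFC 2612.
 */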
#define qop(in, out, x, f) \
	F ## f(in ## 1, x); \
	vpxor out ## 1, x, out ## 1; \
	F ## f(in ## 2, x); \
	vpxor out ## 2, x, out ## 2;
#define Q(n) \
	vbroadcastss	(km+(4*(4*n+0)))(CTX), RKM; \
	vpinsrb $0,	(kr+(4*n+0))(CTX), RKRF, RKRF; \
	vpsubq		RKRF, R32, RKRR; \
	qop(RD, RC, RX, 1); \
	\
	vbroadcastss	(km+(4*(4*n+1)))(CTX), RKM; \
	vpinsrb $0,	(kr+(4*n+1))(CTX), RKRF, RKRF; \
	vpsubq		RKRF, R32, RKRR; \
	qop(RC, RB, RX, 2); \
	\
	vbroadcastss	(km+(4*(4*n+2)))(CTX), RKM; \
	vpinsrb $0,	(kr+(4*n+2))(CTX), RKRF, RKRF; \
	vpsubq		RKRF, R32, RKRR; \
	qop(RB, RA, RX, 3); \
	\
	vbroadcastss	(km+(4*(4*n+3)))(CTX), RKM; \
	vpinsrb $0,	(kr+(4*n+3))(CTX), RKRF, RKRF; \
	vpsubq		RKRF, R32, RKRR; \
	qop(RA, RD, RX, 1);
#define QBAR(n) \
	vbroadcastss	(km+(4*(4*n+3)))(CTX), RKM; \
	vpinsrb $0,	(kr+(4*n+3))(CTX), RKRF, RKRF; \
	vpsubq		RKRF, R32, RKRR; \
	qop(RA, RD, RX, 1); \
	\
	vbroadcastss	(km+(4*(4*n+2)))(CTX), RKM; \
	vpinsrb $0,	(kr+(4*n+2))(CTX), RKRF, RKRF; \
	vpsubq		RKRF, R32, RKRR; \
	qop(RB, RA, RX, 3); \
	\
	vbroadcastss	(km+(4*(4*n+1)))(CTX), RKM; \
	vpinsrb $0,	(kr+(4*n+1))(CTX), RKRF, RKRF; \
	vpsubq		RKRF, R32, RKRR; \
	qop(RC, RB, RX, 2); \
	\
	vbroadcastss	(km+(4*(4*n+0)))(CTX), RKM; \
	vpinsrb $0,	(kr+(4*n+0))(CTX), RKRF, RKRF; \
	vpsubq		RKRF, R32, RKRR; \
	qop(RD, RC, RX, 1);
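/*
 * Q(n) is forward quad-round n from RFC 2612: C ^= f1(D), B ^= f2(C),
 * A ^= f3(B), D ^= f1(A).  Each step broadcasts the 32-bit Km subkey into
 * RKM, loads the Kr rotation count into byte 0 of RKRF and derives 32-Kr
 * in RKRR.  QBAR(n) applies the same four steps in reverse order, giving
 * the inverse quad-round; qop runs one step over both 4-block groups.
 */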
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq	x1, x0, t0; \
	vpunpckhdq	x1, x0, t2; \
	vpunpckldq	x3, x2, t1; \
	vpunpckhdq	x3, x2, x3; \
	\
	vpunpcklqdq	t1, t0, x0; \
	vpunpckhqdq	t1, t0, x1; \
	vpunpcklqdq	x3, t2, x2; \
	vpunpckhqdq	x3, t2, x3;
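/*
 * transpose_4x4 treats x0..x3 as a 4x4 matrix of 32-bit words and transposes
 * it, so afterwards each register holds the same word position (A, B, C or D)
 * from four different blocks instead of one whole block per register.
 */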
#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	vmovdqu (0*4*4)(in),	x0; \
	vmovdqu (1*4*4)(in),	x1; \
	vmovdqu (2*4*4)(in),	x2; \
	vmovdqu (3*4*4)(in),	x3; \
	vpshufb RMASK, x0,	x0; \
	vpshufb RMASK, x1,	x1; \
	vpshufb RMASK, x2,	x2; \
	vpshufb RMASK, x3,	x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
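/*
 * inpack_blocks loads four 16-byte blocks, byte-swaps every 32-bit word
 * (CAST6 operates on big-endian words) and transposes them into the
 * word-sliced register layout used by the quad-rounds.
 */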
#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb RMASK, x0, x0; \
	vpshufb RMASK, x1, x1; \
	vpshufb RMASK, x2, x2; \
	vpshufb RMASK, x3, x3; \
	vmovdqu x0, (0*4*4)(out); \
	vmovdqu x1, (1*4*4)(out); \
	vmovdqu x2, (2*4*4)(out); \
	vmovdqu x3, (3*4*4)(out);
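/*
 * outunpack_blocks reverses inpack_blocks: transpose back to one block per
 * register, undo the byte swap and store.  The _xor variant below
 * additionally XORs each result into the data already at dst before
 * storing, for callers that request xor output.
 */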
#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb RMASK, x0, x0; \
	vpshufb RMASK, x1, x1; \
	vpshufb RMASK, x2, x2; \
	vpshufb RMASK, x3, x3; \
	vpxor (0*4*4)(out), x0, x0; \
	vmovdqu x0, (0*4*4)(out); \
	vpxor (1*4*4)(out), x1, x1; \
	vmovdqu x1, (1*4*4)(out); \
	vpxor (2*4*4)(out), x2, x2; \
	vmovdqu x2, (2*4*4)(out); \
	vpxor (3*4*4)(out), x3, x3; \
	vmovdqu x3, (3*4*4)(out);
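/*
 * Constants: .Lbswap_mask reverses the bytes of each 32-bit lane under
 * vpshufb; .L32_mask provides the value 32 from which the complementary
 * right-shift count 32-Kr is computed in the quad-round macros.
 */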
.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.L32_mask:
	.byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

.align 16
.global __cast6_enc_blk_8way
.type   __cast6_enc_blk_8way,@function;

__cast6_enc_blk_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */

	pushq %rbx;
	pushq %rcx;

	vmovdqu .Lbswap_mask, RMASK;
	vmovdqu .L32_mask, R32;
	vpxor RKRF, RKRF, RKRF;

	leaq (4*4*4)(%rdx), %rax;
	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);

	xorq RID1, RID1;
	xorq RID2, RID2;
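	/*
	 * CAST6 encryption per RFC 2612: six forward quad-rounds followed by
	 * six inverse quad-rounds.
	 */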
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rcx;
	popq %rbx;

	leaq (4*4*4)(%rsi), %rax;

	testb %cl, %cl;
	jnz __enc_xor8;

	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);

	ret;

__enc_xor8:
	outunpack_xor_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
	outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);

	ret;
.align 16
.global cast6_dec_blk_8way
.type   cast6_dec_blk_8way,@function;

cast6_dec_blk_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	pushq %rbx;

	vmovdqu .Lbswap_mask, RMASK;
	vmovdqu .L32_mask, R32;
	vpxor RKRF, RKRF, RKRF;

	leaq (4*4*4)(%rdx), %rax;
	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);

	xorq RID1, RID1;
	xorq RID2, RID2;
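	/*
	 * Decryption reuses the same quad-round structure with the per-round
	 * keys taken in reverse order: keys 11..6 for the forward half,
	 * 5..0 for the inverse half.
	 */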
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;

	leaq (4*4*4)(%rsi), %rax;
	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);

	ret;