@@ -1,7 +1,7 @@
 /*
  * x86_64/AVX/AES-NI assembler implementation of Camellia
  *
- * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -589,6 +589,10 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

+/* For XTS mode IV generation */
+.Lxts_gf128mul_and_shl1_mask:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
 /*
  * pre-SubByte transform
  *
@@ -1090,3 +1094,177 @@ ENTRY(camellia_ctr_16way)

 	ret;
 ENDPROC(camellia_ctr_16way)
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+	vpsrad $31, iv, tmp; \
+	vpaddq iv, iv, iv; \
+	vpshufd $0x13, tmp, tmp; \
+	vpand mask, tmp, tmp; \
+	vpxor tmp, iv, iv;
+
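For reference, gf128mul_x_ble doubles the 128-bit XTS tweak in GF(2^128), with the block viewed as two little-endian 64-bit halves; .Lxts_gf128mul_and_shl1_mask supplies both the 0x87 reduction constant and the carry bit between the halves. A minimal scalar C sketch of the same operation (an illustration only, not the kernel's gf128mul helper; the name gf128mul_x_ble_c is made up here):

#include <stdint.h>

/* Double a 128-bit XTS tweak t, stored as two little-endian 64-bit words
 * (t[0] = low half, t[1] = high half). The bit shifted out of t[0] carries
 * into t[1]; the bit shifted out of t[1] folds back into t[0] as 0x87,
 * matching the vpsrad/vpaddq/vpshufd/vpand/vpxor sequence above. */
static void gf128mul_x_ble_c(uint64_t t[2])
{
	uint64_t carry_lo = t[0] >> 63;	/* becomes bit 0 of the high half */
	uint64_t carry_hi = t[1] >> 63;	/* selects the 0x87 feedback term */

	t[1] = (t[1] << 1) | carry_lo;
	t[0] = (t[0] << 1) ^ (carry_hi ? 0x87 : 0);
}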
+.align 8
+camellia_xts_crypt_16way:
+	/* input:
+	 * %rdi: ctx, CTX
+	 * %rsi: dst (16 blocks)
+	 * %rdx: src (16 blocks)
+	 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 * %r8: index for input whitening key
+	 * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
+	 */
+
+	subq $(16 * 16), %rsp;
+	movq %rsp, %rax;
+
+	vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;
+
+	/* load IV */
+	vmovdqu (%rcx), %xmm0;
+	vpxor 0 * 16(%rdx), %xmm0, %xmm15;
+	vmovdqu %xmm15, 15 * 16(%rax);
+	vmovdqu %xmm0, 0 * 16(%rsi);
+
+	/* construct IVs */
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 1 * 16(%rdx), %xmm0, %xmm15;
+	vmovdqu %xmm15, 14 * 16(%rax);
+	vmovdqu %xmm0, 1 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 2 * 16(%rdx), %xmm0, %xmm13;
+	vmovdqu %xmm0, 2 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 3 * 16(%rdx), %xmm0, %xmm12;
+	vmovdqu %xmm0, 3 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 4 * 16(%rdx), %xmm0, %xmm11;
+	vmovdqu %xmm0, 4 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 5 * 16(%rdx), %xmm0, %xmm10;
+	vmovdqu %xmm0, 5 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 6 * 16(%rdx), %xmm0, %xmm9;
+	vmovdqu %xmm0, 6 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 7 * 16(%rdx), %xmm0, %xmm8;
+	vmovdqu %xmm0, 7 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 8 * 16(%rdx), %xmm0, %xmm7;
+	vmovdqu %xmm0, 8 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 9 * 16(%rdx), %xmm0, %xmm6;
+	vmovdqu %xmm0, 9 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 10 * 16(%rdx), %xmm0, %xmm5;
+	vmovdqu %xmm0, 10 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 11 * 16(%rdx), %xmm0, %xmm4;
+	vmovdqu %xmm0, 11 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 12 * 16(%rdx), %xmm0, %xmm3;
+	vmovdqu %xmm0, 12 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 13 * 16(%rdx), %xmm0, %xmm2;
+	vmovdqu %xmm0, 13 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 14 * 16(%rdx), %xmm0, %xmm1;
+	vmovdqu %xmm0, 14 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 15 * 16(%rdx), %xmm0, %xmm15;
+	vmovdqu %xmm15, 0 * 16(%rax);
+	vmovdqu %xmm0, 15 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vmovdqu %xmm0, (%rcx);
+
+	/* inpack16_pre: */
+	vmovq (key_table)(CTX, %r8, 8), %xmm15;
+	vpshufb .Lpack_bswap, %xmm15, %xmm15;
+	vpxor 0 * 16(%rax), %xmm15, %xmm0;
+	vpxor %xmm1, %xmm15, %xmm1;
+	vpxor %xmm2, %xmm15, %xmm2;
+	vpxor %xmm3, %xmm15, %xmm3;
+	vpxor %xmm4, %xmm15, %xmm4;
+	vpxor %xmm5, %xmm15, %xmm5;
+	vpxor %xmm6, %xmm15, %xmm6;
+	vpxor %xmm7, %xmm15, %xmm7;
+	vpxor %xmm8, %xmm15, %xmm8;
+	vpxor %xmm9, %xmm15, %xmm9;
+	vpxor %xmm10, %xmm15, %xmm10;
+	vpxor %xmm11, %xmm15, %xmm11;
+	vpxor %xmm12, %xmm15, %xmm12;
+	vpxor %xmm13, %xmm15, %xmm13;
+	vpxor 14 * 16(%rax), %xmm15, %xmm14;
+	vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+	call *%r9;
+
+	addq $(16 * 16), %rsp;
+
+	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	ret;
+ENDPROC(camellia_xts_crypt_16way)
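Taken as a whole, camellia_xts_crypt_16way implements the usual XTS dataflow for 16 consecutive blocks: it stores the running tweaks to dst, XORs each source block with its tweak (input whitening), runs all 16 blocks through the Camellia core with a single call through %r9, XORs the results with the tweaks saved in dst (output whitening), and writes the next tweak back through %rcx for the caller. A single-block C sketch of that dataflow, assuming a generic block_crypt callback in place of __camellia_enc_blk16/__camellia_dec_blk16 and reusing gf128mul_x_ble_c from the sketch above:

#include <stdint.h>
#include <string.h>

/* Sketch of one XTS block step: dst = block_crypt(src ^ t) ^ t, then
 * t = t * alpha. Assumes a little-endian host (as on x86) so the byte view
 * of t matches the "ble" tweak layout used by the assembly. */
static void xts_one_block(void (*block_crypt)(uint8_t *out, const uint8_t *in),
			  uint8_t *dst, const uint8_t *src, uint64_t t[2])
{
	uint8_t buf[16];
	int i;

	memcpy(buf, src, 16);
	for (i = 0; i < 16; i++)		/* input whitening with the tweak */
		buf[i] ^= ((const uint8_t *)t)[i];
	block_crypt(buf, buf);			/* one ECB pass through the cipher */
	for (i = 0; i < 16; i++)		/* output whitening with the same tweak */
		buf[i] ^= ((const uint8_t *)t)[i];
	memcpy(dst, buf, 16);
	gf128mul_x_ble_c(t);			/* advance the tweak for the next block */
}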
+
+ENTRY(camellia_xts_enc_16way)
+	/* input:
+	 * %rdi: ctx, CTX
+	 * %rsi: dst (16 blocks)
+	 * %rdx: src (16 blocks)
+	 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+	xorl %r8d, %r8d; /* input whitening key, 0 for enc */
+
+	leaq __camellia_enc_blk16, %r9;
+
+	jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_enc_16way)
+
+ENTRY(camellia_xts_dec_16way)
+	/* input:
+	 * %rdi: ctx, CTX
+	 * %rsi: dst (16 blocks)
+	 * %rdx: src (16 blocks)
+	 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* input whitening key, last for dec */
+
+	leaq __camellia_dec_blk16, %r9;
+
+	jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_dec_16way)
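The two entry points differ only in how they pick the whitening-key index passed in %r8: encryption whitens with key_table[0], while decryption starts from the other end of the key schedule, whose slot depends on the key size. A hypothetical helper (not kernel code) expressing the same selection as the cmpl/cmovel pair:

/* Hypothetical helper mirroring the cmpl/cmovel sequence above: the
 * decryption input-whitening index is 24 for 128-bit keys and 32 for
 * 192/256-bit keys. */
static unsigned int camellia_dec_whitening_index(unsigned int key_length_bytes)
{
	return key_length_bytes == 16 ? 24 : 32;
}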