@@ -23,6 +23,8 @@
  *
  */
 
+#include "glue_helper-asm-avx.S"
+
 .file "cast6-avx-x86_64-asm_64.S"
 
 .extern cast6_s1
@@ -205,11 +207,7 @@
 	vpunpcklqdq x3, t2, x2; \
 	vpunpckhqdq x3, t2, x3;
 
-#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \
-	vmovdqu (0*4*4)(in), x0; \
-	vmovdqu (1*4*4)(in), x1; \
-	vmovdqu (2*4*4)(in), x2; \
-	vmovdqu (3*4*4)(in), x3; \
+#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
 	vpshufb rmask, x0, x0; \
 	vpshufb rmask, x1, x1; \
 	vpshufb rmask, x2, x2; \
@@ -217,39 +215,21 @@
 	\
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 
-#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
+#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	\
 	vpshufb rmask, x0, x0; \
 	vpshufb rmask, x1, x1; \
 	vpshufb rmask, x2, x2; \
-	vpshufb rmask, x3, x3; \
-	vmovdqu x0, (0*4*4)(out); \
-	vmovdqu x1, (1*4*4)(out); \
-	vmovdqu x2, (2*4*4)(out); \
-	vmovdqu x3, (3*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vpshufb rmask, x0, x0; \
-	vpshufb rmask, x1, x1; \
-	vpshufb rmask, x2, x2; \
-	vpshufb rmask, x3, x3; \
-	vpxor (0*4*4)(out), x0, x0; \
-	vmovdqu x0, (0*4*4)(out); \
-	vpxor (1*4*4)(out), x1, x1; \
-	vmovdqu x1, (1*4*4)(out); \
-	vpxor (2*4*4)(out), x2, x2; \
-	vmovdqu x2, (2*4*4)(out); \
-	vpxor (3*4*4)(out), x3, x3; \
-	vmovdqu x3, (3*4*4)(out);
+	vpshufb rmask, x3, x3;
 
 .data
 
 .align 16
 .Lbswap_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 .Lrkr_enc_Q_Q_QBAR_QBAR:
 	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
 .Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
@@ -269,31 +249,26 @@
 
 .text
 
-.align 16
-.global __cast6_enc_blk_8way
-.type __cast6_enc_blk_8way,@function;
+.align 8
+.type __cast6_enc_blk8,@function;
 
-__cast6_enc_blk_8way:
+__cast6_enc_blk8:
 	/* input:
 	 * %rdi: ctx, CTX
-	 * %rsi: dst
-	 * %rdx: src
-	 * %rcx: bool, if true: xor output
+	 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+	 * output:
+	 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 	 */
 
 	pushq %rbp;
 	pushq %rbx;
-	pushq %rcx;
 
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
 
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	preload_rkr(0, dummy, none);
 	Q(0);
@@ -311,36 +286,25 @@ __cast6_enc_blk_8way:
 	QBAR(10);
 	QBAR(11);
 
-	popq %rcx;
 	popq %rbx;
 	popq %rbp;
 
 	vmovdqa .Lbswap_mask, RKM;
-	leaq (4*4*4)(%r11), %rax;
-
-	testb %cl, %cl;
-	jnz __enc_xor8;
-
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	ret;
 
-__enc_xor8:
-	outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	ret;
 
-.align 16
-.global cast6_dec_blk_8way
-.type cast6_dec_blk_8way,@function;
+.align 8
+.type __cast6_dec_blk8,@function;
 
-cast6_dec_blk_8way:
+__cast6_dec_blk8:
 	/* input:
 	 * %rdi: ctx, CTX
-	 * %rsi: dst
-	 * %rdx: src
+	 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+	 * output:
+	 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
 	 */
 
 	pushq %rbp;
@@ -350,11 +314,8 @@ cast6_dec_blk_8way:
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
 
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
 	Q(11);
@@ -376,8 +337,103 @@ cast6_dec_blk_8way:
 	popq %rbp;
 
 	vmovdqa .Lbswap_mask, RKM;
-	leaq (4*4*4)(%r11), %rax;
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+	ret;
+
+.align 8
+.global cast6_ecb_enc_8way
+.type cast6_ecb_enc_8way,@function;
+
+cast6_ecb_enc_8way:
+	/* input:
+	 * %rdi: ctx, CTX
+	 * %rsi: dst
+	 * %rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_enc_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global cast6_ecb_dec_8way
+.type cast6_ecb_dec_8way,@function;
+
+cast6_ecb_dec_8way:
+	/* input:
+	 * %rdi: ctx, CTX
+	 * %rsi: dst
+	 * %rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_dec_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global cast6_cbc_dec_8way
+.type cast6_cbc_dec_8way,@function;
+
+cast6_cbc_dec_8way:
+	/* input:
+	 * %rdi: ctx, CTX
+	 * %rsi: dst
+	 * %rdx: src
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_dec_blk8;
+
+	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
+
+	ret;
+
+.align 8
+.global cast6_ctr_8way
+.type cast6_ctr_8way,@function;
+
+cast6_ctr_8way:
+	/* input:
+	 * %rdi: ctx, CTX
+	 * %rsi: dst
+	 * %rdx: src
+	 * %rcx: iv (little endian, 128bit)
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		      RD2, RX, RKR, RKM);
+
+	call __cast6_enc_blk8;
+
+	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
 
 	ret;
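
For reference only (not part of the patch above): given the register comments in
the new entry points and the x86-64 SysV calling convention (%rdi = ctx,
%rsi = dst, %rdx = src, %rcx = iv), C glue code could declare the exported
8-way routines roughly as sketched below. The struct and IV type names are
assumptions for illustration, not taken from this diff.

	/* Hypothetical glue-side prototypes for the exported 8-way routines;
	 * type names (struct cast6_ctx, le128) are assumed, not from this diff. */
	asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst,
					   const u8 *src);
	asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst,
					   const u8 *src);
	asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
					   const u8 *src);
	asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst,
				       const u8 *src, le128 *iv);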