/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>

.file "camellia-x86_64-asm_64.S"
.text

/*
 * Lookup tables used by the round macros in this file. They are only
 * declared here; their contents live elsewhere. The round macros index
 * them with a scale of 8, i.e. as arrays of 8-byte entries.
 */
.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

/* Short aliases for the table symbols, as spelled in the round macros. */
#define sp10011110 camellia_sp10011110
#define sp22000222 camellia_sp22000222
#define sp03303033 camellia_sp03303033
#define sp00444404 camellia_sp00444404
#define sp02220222 camellia_sp02220222
#define sp30333033 camellia_sp30333033
#define sp44044404 camellia_sp44044404
#define sp11101110 camellia_sp11101110

/*
 * Used below as the byte offset of key_length inside the context.
 * Presumably this is the size in bytes of the subkey table that precedes
 * it (272 = 34 entries of 8 bytes) -- confirm against the C definition of
 * the context structure.
 */
#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: byte offsets of the fields accessed from this file */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/*
 * register macros
 *
 * For a base name X, the suffixes d / bl / bh name the 32-bit, low-byte and
 * second-byte views of the same register. The macros in this file build
 * these suffixed names by token pasting, so a base name may only be passed
 * to a macro if the views that macro pastes are defined here.
 */

/* CTX: context pointer (first argument); RIO: current input/output pointer */
#define CTX %rdi
#define RIO %rsi
#define RIOd %esi

/* Block state: the "0" registers hold the first block, "1" the second one */
#define RAB0 %rax
#define RCD0 %rcx
#define RAB1 %rbx
#define RCD1 %rdx

#define RAB0d %eax
#define RCD0d %ecx
#define RAB1d %ebx
#define RCD1d %edx

#define RAB0bl %al
#define RCD0bl %cl
#define RAB1bl %bl
#define RCD1bl %dl

#define RAB0bh %ah
#define RCD0bh %ch
#define RAB1bh %bh
#define RCD1bh %dh

/*
 * Temporaries. RT0 is the same register as RIO (%rsi), so every macro that
 * uses RT0 as scratch destroys the RIO pointer.
 */
#define RT0 %rsi
#define RT1 %r12
#define RT2 %r8

#define RT0d %esi
#define RT1d %r12d
#define RT2d %r8d

#define RT2bl %r8b

/* Registers the functions below use to park values while the rounds run */
#define RXOR %r9
#define RR12 %r10
#define RDST %r11

#define RXORd %r9d
#define RXORbl %r9b

/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
 *
 * Uses the two lowest bytes of ab as table indices and then rotates them
 * out of the way:
 *
 *	dst ^= T0[ab & 0xff];
 *	dst ^= T1[(ab >> 8) & 0xff];
 *	ab   = ror64(ab, 16);
 *
 * T0 and T1 are table symbols with 8-byte entries. ab is pasted with "bl"
 * and "bh", and tmp1 / tmp2 with "d", so all three must be register macro
 * base names for which those suffixed macros exist. Both byte indices are
 * extracted before ab is rotated. tmp1, tmp2 and the flags are clobbered.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl, tmp2 ## d; \
	movzbl ab ## bh, tmp1 ## d; \
	rorq $16, ab; \
	xorq T0(, tmp2, 8), dst; \
	xorq T1(, tmp1, 8), dst;

/**********************************************************************
  1-way camellia
 **********************************************************************/
/*
 * roundsm(ab, subkey, cd): one round on block 0.
 *
 * ab and cd are register base prefixes (RAB or RCD); "0" is pasted on to
 * select the block-0 registers. The net effect is
 *
 *	cd0 ^= key_table[subkey] ^ (eight table lookups, one per byte of ab0)
 *
 * The 8-byte subkey is loaded into RT2. Half of the lookups are XORed into
 * RT2 and the other half directly into cd0; RT2 is folded into cd0 at the
 * end. The four xor2ror16 steps rotate ab0 by 4 * 16 = 64 bits in total, so
 * ab0 holds its original value again afterwards.
 *
 * Clobbers RT0 (same register as RIO), RT1, RT2 and the flags.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2, cd ## 0;

/*
 * fls(l, r, kl, kr): key-dependent mixing of the two halves of block 0,
 * used between groups of rounds. (Presumably this is the FL / FL^-1 layer
 * of the Camellia specification -- confirm against the specification.)
 *
 * l and r are register base prefixes; "0" is pasted on to select block 0.
 * Writing x.lo / x.hi for the low / high 32 bits of a 64-bit value and
 * K[n] for 8-byte entry n of key_table, the steps are, in this order:
 *
 *	l0.hi ^= rol32(l0.lo & K[kl].lo, 1);
 *	r0.lo ^= (r0.hi | K[kr].hi);
 *	l0.lo ^= (l0.hi | K[kl].hi);		uses the updated l0.hi
 *	r0.hi ^= rol32(r0.lo & K[kr].lo, 1);	uses the updated r0.lo
 *
 * Clobbers RT0 (same register as RIO), RT1, RT2 and the flags.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl l ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq r ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
	orq l ## 0, RT2; \
	shrq $32, RT2; \
	xorq RT2, l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
	andl r ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, r ## 0;

/*
 * enc_rounds(i): six consecutive rounds on block 0 with subkeys i + 2
 * through i + 7 in ascending order. The two halves swap roles on every
 * round: RAB is the round input and RCD the target first, then vice versa.
 */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

/*
 * enc_fls(i): fls step for encryption; subkey i is applied to RAB (kl) and
 * subkey i + 1 to RCD (kr).
 */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);

/*
 * enc_inpack(): load one 16-byte block from RIO into RAB0 / RCD0.
 *
 * Bytes 0-7 go to RAB0 and bytes 8-15 to RCD0; each quadword is
 * byte-swapped and then has its 32-bit halves exchanged (a rotate by 32).
 * Finally the 8-byte entry 0 of key_table is XORed into RAB0.
 */
#define enc_inpack() \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX), RAB0;

/*
 * enc_outunpack(op, max): write block 0 out through RIO.
 *
 * op is pasted with "q" to form the store instruction: "mov" overwrites
 * the destination, "xor" XORs the result into it. max is a register
 * holding the index of the 8-byte key_table entry that is XORed into RCD0
 * first. The halves are written in swapped order: RCD0 goes to bytes 0-7
 * and RAB0 to bytes 8-15, each after the inverse of the input transform
 * (rotate by 32, then byte swap).
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8), RCD0; \
	rorq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO);

/*
 * dec_rounds(i): six consecutive rounds on block 0 with subkeys i + 7 down
 * to i + 2, i.e. the same subkeys as enc_rounds(i) in descending order.
 */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

/*
 * dec_fls(i): fls step for decryption; the subkeys are swapped relative to
 * enc_fls, so subkey i + 1 is applied to RAB (kl) and subkey i to RCD (kr).
 */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);

/*
 * dec_inpack(max): load one 16-byte block from RIO into RAB0 / RCD0.
 *
 * Same load transform as enc_inpack (byte swap, then rotate by 32), but
 * the value XORed into RAB0 is the 8-byte key_table entry whose index is
 * held in the register max.
 */
#define dec_inpack(max) \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX, max, 8), RAB0;

/*
 * dec_outunpack(): store block 0 through RIO.
 *
 * Entry 0 of key_table is XORed into RCD0 first. The halves are written in
 * swapped order: RCD0 goes to bytes 0-7 and RAB0 to bytes 8-15, each after
 * a rotate by 32 and a byte swap. The destination is always overwritten.
 */
#define dec_outunpack() \
	xorq key_table(CTX), RCD0; \
	rorq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO);

SYM_FUNC_START(__camellia_enc_blk)
	/*
	 * Encrypt one 16-byte block.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor (only the low byte is tested; when it is
	 *	      non-zero the result is XORed into dst instead of
	 *	      overwriting it)
	 *
	 * %r12 is used as RT1 and is preserved by parking it in RR12;
	 * the stack is not used.
	 */
	movq %r12, RR12;

	/*
	 * %rcx becomes RCD0 and %rsi becomes RIO / RT0, so move the
	 * arguments they carry out of the way first.
	 */
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	/* set only after the rounds: the round macros use RT1 as scratch */
	movl $24, RT1d; /* max */

	/*
	 * A key length of 16 is finished here; any other value runs one
	 * more fls + rounds group and moves max to 32.
	 * NOTE(review): this compares only the low byte of the field,
	 * while camellia_dec_blk uses cmpl at the same offset -- confirm
	 * the field width.
	 */
	cmpb $16, key_length(CTX);
	je .L__enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	testb RXORbl, RXORbl;
	/* RIO was clobbered as RT0; mov leaves the flags from testb intact */
	movq RDST, RIO;

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RR12, %r12;
	ret;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RR12, %r12;
	ret;
SYM_FUNC_END(__camellia_enc_blk)

SYM_FUNC_START(camellia_dec_blk)
	/*
	 * Decrypt one 16-byte block; dst is always overwritten.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * %r12 is used as RT1 and is preserved by parking it in RR12;
	 * the stack is not used.
	 */

	/*
	 * max = (key_length == 16) ? 24 : 32. RXORd is only a scratch
	 * register here. The two movl do not change the flags, so cmovel
	 * still acts on the result of the cmpl.
	 */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack(RT2);

	/* test max now: the round macros overwrite RT2 */
	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	/* RIO was clobbered as RT0 during the rounds; point it at dst */
	movq RDST, RIO;

	dec_outunpack();

	movq RR12, %r12;
	ret;
SYM_FUNC_END(camellia_dec_blk)

/**********************************************************************
  2-way camellia
 **********************************************************************/
/*
 * roundsm2(ab, subkey, cd): one round on both blocks with the same subkey.
 *
 * ab and cd are register base prefixes (RAB or RCD); "0" and "1" are
 * pasted on to select the block. For each block n the net effect is
 *
 *	cdn ^= key_table[subkey] ^ (eight table lookups, one per byte of abn)
 *
 * The subkey is loaded into RT2 and XORed into cd1 right away. Block 0
 * then uses RT2 as a second accumulator, which is folded into cd0 once
 * its lookups are done; all block-1 lookups go straight into cd1. Each of
 * ab0 and ab1 is rotated by 4 * 16 = 64 bits in total and therefore ends
 * up unchanged.
 *
 * Clobbers RT0 (same register as RIO), RT1, RT2 and the flags.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	xorq RT2, cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
	xorq RT2, cd ## 0; \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);

/*
 * fls2(l, r, kl, kr): the fls mixing step applied to both blocks.
 *
 * l and r are register base prefixes; "0" and "1" are pasted on to select
 * the block. Writing x.lo / x.hi for the low / high 32 bits and K[n] for
 * 8-byte entry n of key_table, each block n gets, in this order:
 *
 *	ln.hi ^= rol32(ln.lo & K[kl].lo, 1);
 *	rn.lo ^= (rn.hi | K[kr].hi);
 *	ln.lo ^= (ln.hi | K[kl].hi);		uses the updated ln.hi
 *	rn.hi ^= rol32(rn.lo & K[kr].lo, 1);	uses the updated rn.lo
 *
 * The first two steps are issued for block 0 and then block 1, followed by
 * the last two steps for block 0 and then block 1.
 *
 * Clobbers RT0 (same register as RIO), RT1, RT2 and the flags.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl l ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq r ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, r ## 0; \
	\
	movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
	andl l ## 1d, RT2d; \
	roll $1, RT2d; \
	shlq $32, RT2; \
	xorq RT2, l ## 1; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
	orq r ## 1, RT0; \
	shrq $32, RT0; \
	xorq RT0, r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
	orq l ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
	andl r ## 0d, RT2d; \
	roll $1, RT2d; \
	shlq $32, RT2; \
	xorq RT2, r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
	orq l ## 1, RT0; \
	shrq $32, RT0; \
	xorq RT0, l ## 1; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
	andl r ## 1d, RT1d; \
	roll $1, RT1d; \
	shlq $32, RT1; \
	xorq RT1, r ## 1;

/*
 * enc_rounds2(i): six consecutive two-block rounds with subkeys i + 2
 * through i + 7 in ascending order; RAB and RCD swap roles on every round.
 */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

/*
 * enc_fls2(i): two-block fls step for encryption; subkey i is applied to
 * RAB (kl) and subkey i + 1 to RCD (kr).
 */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);

/*
 * enc_inpack2(): load two consecutive 16-byte blocks from RIO.
 *
 * Block 0 is read from byte offsets 0 and 8 into RAB0 / RCD0, block 1 from
 * offsets 16 and 24 into RAB1 / RCD1. Every quadword is byte-swapped and
 * then has its 32-bit halves exchanged (a rotate by 32). Entry 0 of
 * key_table is XORed into RAB0 and RAB1.
 *
 * RCD1 is %rdx, so the src argument must already have been copied to RIO.
 */
#define enc_inpack2() \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX), RAB0; \
	\
	movq 8*2(RIO), RAB1; \
	bswapq RAB1; \
	rorq $32, RAB1; \
	movq 12*2(RIO), RCD1; \
	bswapq RCD1; \
	rolq $32, RCD1; \
	xorq key_table(CTX), RAB1;

/*
 * enc_outunpack2(op, max): write both blocks out through RIO.
 *
 * op is pasted with "q" to form the store instruction: "mov" overwrites
 * the destination, "xor" XORs the result into it. max is a register
 * holding the index of the 8-byte key_table entry XORed into RCD0 and
 * RCD1. For each block the halves are written in swapped order (RCD first,
 * then RAB) after a rotate by 32 and a byte swap: block 0 goes to byte
 * offsets 0 and 8, block 1 to offsets 16 and 24.
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO); \
	\
	xorq key_table(CTX, max, 8), RCD1; \
	rolq $32, RCD1; \
	bswapq RCD1; \
	op ## q RCD1, 8*2(RIO); \
	rorq $32, RAB1; \
	bswapq RAB1; \
	op ## q RAB1, 12*2(RIO);

/*
 * dec_rounds2(i): six consecutive two-block rounds with subkeys i + 7 down
 * to i + 2, i.e. the subkeys of enc_rounds2(i) in descending order.
 */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

/*
 * dec_fls2(i): two-block fls step for decryption; the subkeys are swapped
 * relative to enc_fls2, so subkey i + 1 goes to RAB (kl) and i to RCD (kr).
 */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);

/*
 * dec_inpack2(max): load two consecutive 16-byte blocks from RIO.
 *
 * Same layout and load transform as enc_inpack2 (block 0 at byte offsets
 * 0 and 8, block 1 at 16 and 24; byte swap, then rotate by 32), but the
 * value XORed into RAB0 and RAB1 is the 8-byte key_table entry whose index
 * is held in the register max.
 *
 * RCD1 is %rdx, so the src argument must already have been copied to RIO.
 */
#define dec_inpack2(max) \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX, max, 8), RAB0; \
	\
	movq 8*2(RIO), RAB1; \
	bswapq RAB1; \
	rorq $32, RAB1; \
	movq 12*2(RIO), RCD1; \
	bswapq RCD1; \
	rolq $32, RCD1; \
	xorq key_table(CTX, max, 8), RAB1;

/*
 * dec_outunpack2(): store both blocks through RIO.
 *
 * Entry 0 of key_table is XORed into RCD0 and RCD1. For each block the
 * halves are written in swapped order (RCD first, then RAB) after a rotate
 * by 32 and a byte swap: block 0 goes to byte offsets 0 and 8, block 1 to
 * offsets 16 and 24. The destination is always overwritten.
 */
#define dec_outunpack2() \
	xorq key_table(CTX), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO); \
	\
	xorq key_table(CTX), RCD1; \
	rolq $32, RCD1; \
	bswapq RCD1; \
	movq RCD1, 8*2(RIO); \
	rorq $32, RAB1; \
	bswapq RAB1; \
	movq RAB1, 12*2(RIO);

SYM_FUNC_START(__camellia_enc_blk_2way)
	/*
	 * Encrypt two consecutive 16-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor (only the low byte is tested; when it is
	 *	      non-zero the result is XORed into dst instead of
	 *	      overwriting it)
	 *
	 * %rbx (RAB1) is preserved on the stack and %r12 (RT1) by parking
	 * it in RR12.
	 */
	pushq %rbx;

	movq %r12, RR12;
	/*
	 * %rcx, %rsi and %rdx become RCD0, RIO / RT0 and RCD1, so move
	 * the arguments they carry out of the way first.
	 */
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	/* set only after the rounds: the round macros overwrite RT2 */
	movl $24, RT2d; /* max */

	/*
	 * A key length of 16 is finished here; any other value runs one
	 * more fls + rounds group and moves max to 32.
	 * NOTE(review): this compares only the low byte of the field,
	 * while the decrypt functions use cmpl at the same offset --
	 * confirm the field width.
	 */
	cmpb $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	/* explicit byte suffix, matching __camellia_enc_blk */
	testb RXORbl, RXORbl;
	/* RIO was clobbered as RT0; mov leaves the flags from testb intact */
	movq RDST, RIO;
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RR12, %r12;
	popq %rbx;
	ret;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RR12, %r12;
	popq %rbx;
	ret;
SYM_FUNC_END(__camellia_enc_blk_2way)

SYM_FUNC_START(camellia_dec_blk_2way)
	/*
	 * Decrypt two consecutive 16-byte blocks; dst is always
	 * overwritten.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * %rbx (RAB1) is preserved by parking it in RXOR and %r12 (RT1)
	 * by parking it in RR12; the stack is not used.
	 */

	/*
	 * max = (key_length == 16) ? 24 : 32. RXORd is only a scratch
	 * register at this point. The two movl do not change the flags,
	 * so cmovel still acts on the result of the cmpl.
	 */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	/* RXOR is free again from here on and keeps the caller's %rbx */
	movq %rbx, RXOR;
	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack2(RT2);

	/* test max now: the round macros overwrite RT2 */
	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	/* RIO was clobbered as RT0 during the rounds; point it at dst */
	movq RDST, RIO;

	dec_outunpack2();

	movq RR12, %r12;
	movq RXOR, %rbx;
	ret;
SYM_FUNC_END(camellia_dec_blk_2way)