mirror of https://github.com/Qortal/Brooklyn
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
497 lines
9.9 KiB
497 lines
9.9 KiB
/* SPDX-License-Identifier: GPL-2.0 OR MIT */ |
|
/* |
|
* Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved. |
|
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. |
|
*/ |
|
|
|
#define MASK_U32 0x3c |
|
#define CHACHA20_BLOCK_SIZE 64 |
|
#define STACK_SIZE 32 |
|
|
|
#define X0 $t0 |
|
#define X1 $t1 |
|
#define X2 $t2 |
|
#define X3 $t3 |
|
#define X4 $t4 |
|
#define X5 $t5 |
|
#define X6 $t6 |
|
#define X7 $t7 |
|
#define X8 $t8 |
|
#define X9 $t9 |
|
#define X10 $v1 |
|
#define X11 $s6 |
|
#define X12 $s5 |
|
#define X13 $s4 |
|
#define X14 $s3 |
|
#define X15 $s2 |
|
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */ |
|
#define T0 $s1 |
|
#define T1 $s0 |
|
#define T(n) T ## n |
|
#define X(n) X ## n |
|
|
|
/* Input arguments */ |
|
#define STATE $a0 |
|
#define OUT $a1 |
|
#define IN $a2 |
|
#define BYTES $a3 |
|
|
|
/* Output argument */ |
|
/* NONCE[0] is kept in a register and not in memory. |
|
* We don't want to touch original value in memory. |
|
* Must be incremented every loop iteration. |
|
*/ |
|
#define NONCE_0 $v0 |
|
|
|
/* SAVED_X and SAVED_CA are set in the jump table. |
|
* Use regs which are overwritten on exit else we don't leak clear data. |
|
* They are used to handling the last bytes which are not multiple of 4. |
|
*/ |
|
#define SAVED_X X15 |
|
#define SAVED_CA $s7 |
|
|
|
#define IS_UNALIGNED $s7 |
|
|
|
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ |
|
#define MSB 0 |
|
#define LSB 3 |
|
#define ROTx rotl |
|
#define ROTR(n) rotr n, 24 |
|
#define CPU_TO_LE32(n) \ |
|
wsbh n; \ |
|
rotr n, 16; |
|
#else |
|
#define MSB 3 |
|
#define LSB 0 |
|
#define ROTx rotr |
|
#define CPU_TO_LE32(n) |
|
#define ROTR(n) |
|
#endif |
|
|
|
#define FOR_EACH_WORD(x) \ |
|
x( 0); \ |
|
x( 1); \ |
|
x( 2); \ |
|
x( 3); \ |
|
x( 4); \ |
|
x( 5); \ |
|
x( 6); \ |
|
x( 7); \ |
|
x( 8); \ |
|
x( 9); \ |
|
x(10); \ |
|
x(11); \ |
|
x(12); \ |
|
x(13); \ |
|
x(14); \ |
|
x(15); |
|
|
|
#define FOR_EACH_WORD_REV(x) \ |
|
x(15); \ |
|
x(14); \ |
|
x(13); \ |
|
x(12); \ |
|
x(11); \ |
|
x(10); \ |
|
x( 9); \ |
|
x( 8); \ |
|
x( 7); \ |
|
x( 6); \ |
|
x( 5); \ |
|
x( 4); \ |
|
x( 3); \ |
|
x( 2); \ |
|
x( 1); \ |
|
x( 0); |
|
|
|
#define PLUS_ONE_0 1 |
|
#define PLUS_ONE_1 2 |
|
#define PLUS_ONE_2 3 |
|
#define PLUS_ONE_3 4 |
|
#define PLUS_ONE_4 5 |
|
#define PLUS_ONE_5 6 |
|
#define PLUS_ONE_6 7 |
|
#define PLUS_ONE_7 8 |
|
#define PLUS_ONE_8 9 |
|
#define PLUS_ONE_9 10 |
|
#define PLUS_ONE_10 11 |
|
#define PLUS_ONE_11 12 |
|
#define PLUS_ONE_12 13 |
|
#define PLUS_ONE_13 14 |
|
#define PLUS_ONE_14 15 |
|
#define PLUS_ONE_15 16 |
|
#define PLUS_ONE(x) PLUS_ONE_ ## x |
|
#define _CONCAT3(a,b,c) a ## b ## c |
|
#define CONCAT3(a,b,c) _CONCAT3(a,b,c) |
|
|
|
#define STORE_UNALIGNED(x) \ |
|
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ |
|
.if (x != 12); \ |
|
lw T0, (x*4)(STATE); \ |
|
.endif; \ |
|
lwl T1, (x*4)+MSB ## (IN); \ |
|
lwr T1, (x*4)+LSB ## (IN); \ |
|
.if (x == 12); \ |
|
addu X ## x, NONCE_0; \ |
|
.else; \ |
|
addu X ## x, T0; \ |
|
.endif; \ |
|
CPU_TO_LE32(X ## x); \ |
|
xor X ## x, T1; \ |
|
swl X ## x, (x*4)+MSB ## (OUT); \ |
|
swr X ## x, (x*4)+LSB ## (OUT); |
|
|
|
#define STORE_ALIGNED(x) \ |
|
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ |
|
.if (x != 12); \ |
|
lw T0, (x*4)(STATE); \ |
|
.endif; \ |
|
lw T1, (x*4) ## (IN); \ |
|
.if (x == 12); \ |
|
addu X ## x, NONCE_0; \ |
|
.else; \ |
|
addu X ## x, T0; \ |
|
.endif; \ |
|
CPU_TO_LE32(X ## x); \ |
|
xor X ## x, T1; \ |
|
sw X ## x, (x*4) ## (OUT); |
|
|
|
/* Jump table macro. |
|
* Used for setup and handling the last bytes, which are not multiple of 4. |
|
* X15 is free to store Xn |
|
* Every jumptable entry must be equal in size. |
|
*/ |
|
#define JMPTBL_ALIGNED(x) \ |
|
.Lchacha_mips_jmptbl_aligned_ ## x: ; \ |
|
.set noreorder; \ |
|
b .Lchacha_mips_xor_aligned_ ## x ## _b; \ |
|
.if (x == 12); \ |
|
addu SAVED_X, X ## x, NONCE_0; \ |
|
.else; \ |
|
addu SAVED_X, X ## x, SAVED_CA; \ |
|
.endif; \ |
|
.set reorder |
|
|
|
#define JMPTBL_UNALIGNED(x) \ |
|
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \ |
|
.set noreorder; \ |
|
b .Lchacha_mips_xor_unaligned_ ## x ## _b; \ |
|
.if (x == 12); \ |
|
addu SAVED_X, X ## x, NONCE_0; \ |
|
.else; \ |
|
addu SAVED_X, X ## x, SAVED_CA; \ |
|
.endif; \ |
|
.set reorder |
|
|
|
#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ |
|
addu X(A), X(K); \ |
|
addu X(B), X(L); \ |
|
addu X(C), X(M); \ |
|
addu X(D), X(N); \ |
|
xor X(V), X(A); \ |
|
xor X(W), X(B); \ |
|
xor X(Y), X(C); \ |
|
xor X(Z), X(D); \ |
|
rotl X(V), S; \ |
|
rotl X(W), S; \ |
|
rotl X(Y), S; \ |
|
rotl X(Z), S; |
|
|
|
.text |
|
.set reorder |
|
.set noat |
|
.globl chacha_crypt_arch |
|
.ent chacha_crypt_arch |
|
chacha_crypt_arch: |
|
.frame $sp, STACK_SIZE, $ra |
|
|
|
/* Load number of rounds */ |
|
lw $at, 16($sp) |
|
|
|
addiu $sp, -STACK_SIZE |
|
|
|
/* Return bytes = 0. */ |
|
beqz BYTES, .Lchacha_mips_end |
|
|
|
lw NONCE_0, 48(STATE) |
|
|
|
/* Save s0-s7 */ |
|
sw $s0, 0($sp) |
|
sw $s1, 4($sp) |
|
sw $s2, 8($sp) |
|
sw $s3, 12($sp) |
|
sw $s4, 16($sp) |
|
sw $s5, 20($sp) |
|
sw $s6, 24($sp) |
|
sw $s7, 28($sp) |
|
|
|
/* Test IN or OUT is unaligned. |
|
* IS_UNALIGNED = ( IN | OUT ) & 0x00000003 |
|
*/ |
|
or IS_UNALIGNED, IN, OUT |
|
andi IS_UNALIGNED, 0x3 |
|
|
|
b .Lchacha_rounds_start |
|
|
|
.align 4 |
|
.Loop_chacha_rounds: |
|
addiu IN, CHACHA20_BLOCK_SIZE |
|
addiu OUT, CHACHA20_BLOCK_SIZE |
|
addiu NONCE_0, 1 |
|
|
|
.Lchacha_rounds_start: |
|
lw X0, 0(STATE) |
|
lw X1, 4(STATE) |
|
lw X2, 8(STATE) |
|
lw X3, 12(STATE) |
|
|
|
lw X4, 16(STATE) |
|
lw X5, 20(STATE) |
|
lw X6, 24(STATE) |
|
lw X7, 28(STATE) |
|
lw X8, 32(STATE) |
|
lw X9, 36(STATE) |
|
lw X10, 40(STATE) |
|
lw X11, 44(STATE) |
|
|
|
move X12, NONCE_0 |
|
lw X13, 52(STATE) |
|
lw X14, 56(STATE) |
|
lw X15, 60(STATE) |
|
|
|
.Loop_chacha_xor_rounds: |
|
addiu $at, -2 |
|
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); |
|
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); |
|
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); |
|
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); |
|
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); |
|
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); |
|
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); |
|
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); |
|
bnez $at, .Loop_chacha_xor_rounds |
|
|
|
addiu BYTES, -(CHACHA20_BLOCK_SIZE) |
|
|
|
/* Is data src/dst unaligned? Jump */ |
|
bnez IS_UNALIGNED, .Loop_chacha_unaligned |
|
|
|
/* Set number rounds here to fill delayslot. */ |
|
lw $at, (STACK_SIZE+16)($sp) |
|
|
|
/* BYTES < 0, it has no full block. */ |
|
bltz BYTES, .Lchacha_mips_no_full_block_aligned |
|
|
|
FOR_EACH_WORD_REV(STORE_ALIGNED) |
|
|
|
/* BYTES > 0? Loop again. */ |
|
bgtz BYTES, .Loop_chacha_rounds |
|
|
|
/* Place this here to fill delay slot */ |
|
addiu NONCE_0, 1 |
|
|
|
/* BYTES < 0? Handle last bytes */ |
|
bltz BYTES, .Lchacha_mips_xor_bytes |
|
|
|
.Lchacha_mips_xor_done: |
|
/* Restore used registers */ |
|
lw $s0, 0($sp) |
|
lw $s1, 4($sp) |
|
lw $s2, 8($sp) |
|
lw $s3, 12($sp) |
|
lw $s4, 16($sp) |
|
lw $s5, 20($sp) |
|
lw $s6, 24($sp) |
|
lw $s7, 28($sp) |
|
|
|
/* Write NONCE_0 back to right location in state */ |
|
sw NONCE_0, 48(STATE) |
|
|
|
.Lchacha_mips_end: |
|
addiu $sp, STACK_SIZE |
|
jr $ra |
|
|
|
.Lchacha_mips_no_full_block_aligned: |
|
/* Restore the offset on BYTES */ |
|
addiu BYTES, CHACHA20_BLOCK_SIZE |
|
|
|
/* Get number of full WORDS */ |
|
andi $at, BYTES, MASK_U32 |
|
|
|
/* Load upper half of jump table addr */ |
|
lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0) |
|
|
|
/* Calculate lower half jump table offset */ |
|
ins T0, $at, 1, 6 |
|
|
|
/* Add offset to STATE */ |
|
addu T1, STATE, $at |
|
|
|
/* Add lower half jump table addr */ |
|
addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0) |
|
|
|
/* Read value from STATE */ |
|
lw SAVED_CA, 0(T1) |
|
|
|
/* Store remaining bytecounter as negative value */ |
|
subu BYTES, $at, BYTES |
|
|
|
jr T0 |
|
|
|
/* Jump table */ |
|
FOR_EACH_WORD(JMPTBL_ALIGNED) |
|
|
|
|
|
.Loop_chacha_unaligned: |
|
/* Set number rounds here to fill delayslot. */ |
|
lw $at, (STACK_SIZE+16)($sp) |
|
|
|
/* BYTES > 0, it has no full block. */ |
|
bltz BYTES, .Lchacha_mips_no_full_block_unaligned |
|
|
|
FOR_EACH_WORD_REV(STORE_UNALIGNED) |
|
|
|
/* BYTES > 0? Loop again. */ |
|
bgtz BYTES, .Loop_chacha_rounds |
|
|
|
/* Write NONCE_0 back to right location in state */ |
|
sw NONCE_0, 48(STATE) |
|
|
|
.set noreorder |
|
/* Fall through to byte handling */ |
|
bgez BYTES, .Lchacha_mips_xor_done |
|
.Lchacha_mips_xor_unaligned_0_b: |
|
.Lchacha_mips_xor_aligned_0_b: |
|
/* Place this here to fill delay slot */ |
|
addiu NONCE_0, 1 |
|
.set reorder |
|
|
|
.Lchacha_mips_xor_bytes: |
|
addu IN, $at |
|
addu OUT, $at |
|
/* First byte */ |
|
lbu T1, 0(IN) |
|
addiu $at, BYTES, 1 |
|
CPU_TO_LE32(SAVED_X) |
|
ROTR(SAVED_X) |
|
xor T1, SAVED_X |
|
sb T1, 0(OUT) |
|
beqz $at, .Lchacha_mips_xor_done |
|
/* Second byte */ |
|
lbu T1, 1(IN) |
|
addiu $at, BYTES, 2 |
|
ROTx SAVED_X, 8 |
|
xor T1, SAVED_X |
|
sb T1, 1(OUT) |
|
beqz $at, .Lchacha_mips_xor_done |
|
/* Third byte */ |
|
lbu T1, 2(IN) |
|
ROTx SAVED_X, 8 |
|
xor T1, SAVED_X |
|
sb T1, 2(OUT) |
|
b .Lchacha_mips_xor_done |
|
|
|
.Lchacha_mips_no_full_block_unaligned: |
|
/* Restore the offset on BYTES */ |
|
addiu BYTES, CHACHA20_BLOCK_SIZE |
|
|
|
/* Get number of full WORDS */ |
|
andi $at, BYTES, MASK_U32 |
|
|
|
/* Load upper half of jump table addr */ |
|
lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0) |
|
|
|
/* Calculate lower half jump table offset */ |
|
ins T0, $at, 1, 6 |
|
|
|
/* Add offset to STATE */ |
|
addu T1, STATE, $at |
|
|
|
/* Add lower half jump table addr */ |
|
addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0) |
|
|
|
/* Read value from STATE */ |
|
lw SAVED_CA, 0(T1) |
|
|
|
/* Store remaining bytecounter as negative value */ |
|
subu BYTES, $at, BYTES |
|
|
|
jr T0 |
|
|
|
/* Jump table */ |
|
FOR_EACH_WORD(JMPTBL_UNALIGNED) |
|
.end chacha_crypt_arch |
|
.set at |
|
|
|
/* Input arguments |
|
* STATE $a0 |
|
* OUT $a1 |
|
* NROUND $a2 |
|
*/ |
|
|
|
#undef X12 |
|
#undef X13 |
|
#undef X14 |
|
#undef X15 |
|
|
|
#define X12 $a3 |
|
#define X13 $at |
|
#define X14 $v0 |
|
#define X15 STATE |
|
|
|
.set noat |
|
.globl hchacha_block_arch |
|
.ent hchacha_block_arch |
|
hchacha_block_arch: |
|
.frame $sp, STACK_SIZE, $ra |
|
|
|
addiu $sp, -STACK_SIZE |
|
|
|
/* Save X11(s6) */ |
|
sw X11, 0($sp) |
|
|
|
lw X0, 0(STATE) |
|
lw X1, 4(STATE) |
|
lw X2, 8(STATE) |
|
lw X3, 12(STATE) |
|
lw X4, 16(STATE) |
|
lw X5, 20(STATE) |
|
lw X6, 24(STATE) |
|
lw X7, 28(STATE) |
|
lw X8, 32(STATE) |
|
lw X9, 36(STATE) |
|
lw X10, 40(STATE) |
|
lw X11, 44(STATE) |
|
lw X12, 48(STATE) |
|
lw X13, 52(STATE) |
|
lw X14, 56(STATE) |
|
lw X15, 60(STATE) |
|
|
|
.Loop_hchacha_xor_rounds: |
|
addiu $a2, -2 |
|
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); |
|
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); |
|
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); |
|
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); |
|
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); |
|
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); |
|
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); |
|
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); |
|
bnez $a2, .Loop_hchacha_xor_rounds |
|
|
|
/* Restore used register */ |
|
lw X11, 0($sp) |
|
|
|
sw X0, 0(OUT) |
|
sw X1, 4(OUT) |
|
sw X2, 8(OUT) |
|
sw X3, 12(OUT) |
|
sw X12, 16(OUT) |
|
sw X13, 20(OUT) |
|
sw X14, 24(OUT) |
|
sw X15, 28(OUT) |
|
|
|
addiu $sp, STACK_SIZE |
|
jr $ra |
|
.end hchacha_block_arch |
|
.set at
|
|
|