mirror of https://github.com/Qortal/Brooklyn
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
294 lines
9.8 KiB
294 lines
9.8 KiB
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-1 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 */
|
|
|
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
|
|
|
/*
 * Register aliases used throughout this file.
 *
 * r3/r4 carry the hash and input pointers. r5 is copied to CTR on entry
 * and afterwards reused as the constant pointer rKP. r14-r23 are the
 * registers that INITIALIZE saves and FINALIZE restores as 64 bit values.
 */
#define rHP	r3	/* pointer to hash value			*/
#define rWP	r4	/* pointer to input				*/
#define rKP	r5	/* pointer to constants				*/

#define rW0	r14	/* 64 bit round words				*/
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rH0	r6	/* 32 bit hash values				*/
#define rH1	r7
#define rH2	r8
#define rH3	r9
#define rH4	r10

#define rT0	r22	/* 64 bit temporary				*/
#define rT1	r0	/* 32 bit temporaries				*/
#define rT2	r11
#define rT3	r12

#define rK	r23	/* 64 bit constant in non-volatile register	*/
|
|
|
/*
 * LOAD_K<k>1: (re)load the round constant register rK.
 *
 * The round macros paste their k argument into LOAD_K##k##1. k = 0 expands
 * to nothing (keep the current constant); k = 1..4 loads the 32 bit word at
 * offset 0/4/8/12 from rKP and splats it into both halves of rK.
 */
#define LOAD_K01

#define LOAD_K11 \
	evlwwsplat	rK,0(rKP);

#define LOAD_K21 \
	evlwwsplat	rK,4(rKP);

#define LOAD_K31 \
	evlwwsplat	rK,8(rKP);

#define LOAD_K41 \
	evlwwsplat	rK,12(rKP);
|
|
|
/*
 * INITIALIZE: function prologue.
 *
 * Opens a 128 byte stack frame and stores r14-r23 as full 64 bit values
 * into the 8 byte slots at 8(r1) .. 80(r1). FINALIZE is the counterpart.
 */
#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame		*/ \
	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
	evstdw		r17,32(r1);					   \
	evstdw		r18,40(r1);					   \
	evstdw		r19,48(r1);					   \
	evstdw		r20,56(r1);					   \
	evstdw		r21,64(r1);					   \
	evstdw		r22,72(r1);					   \
	evstdw		r23,80(r1);
|
|
|
|
|
/*
 * FINALIZE: function epilogue.
 *
 * Reloads r14-r23 from the slots written by INITIALIZE, then overwrites the
 * first 32 bit word of every slot with zero before releasing the 128 byte
 * frame. Only one word per slot is cleared; the inline comments give the
 * rationale for leaving the other word alone. r0 is clobbered.
 */
#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers	*/ \
	evldw		r15,16(r1);					   \
	evldw		r16,24(r1);					   \
	evldw		r17,32(r1);					   \
	evldw		r18,40(r1);					   \
	evldw		r19,48(r1);					   \
	evldw		r20,56(r1);					   \
	evldw		r21,64(r1);					   \
	evldw		r22,72(r1);					   \
	evldw		r23,80(r1);					   \
	xor		r0,r0,r0;	/* r0 = 0			*/ \
	stw		r0,8(r1);	/* Delete sensitive data	*/ \
	stw		r0,16(r1);	/* that we might have pushed	*/ \
	stw		r0,24(r1);	/* from other context that runs	*/ \
	stw		r0,32(r1);	/* the same code. Assume that	*/ \
	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
	stw		r0,48(r1);	/* were already overwritten on	*/ \
	stw		r0,56(r1);	/* the way down to here		*/ \
	stw		r0,64(r1);					   \
	stw		r0,72(r1);					   \
	stw		r0,80(r1);					   \
	addi		r1,r1,128;	/* cleanup stack frame		*/
|
|
|
/*
 * LOAD_DATA(reg, off): fetch one 32 bit message word into reg.
 * NEXT_BLOCK:          advance rWP to the following 64 byte block.
 *
 * Big endian:    plain load at off(rWP); rWP is bumped once per block.
 * Little endian: byte-reversed load at rWP (off is ignored) followed by a
 *                4 byte increment, so rWP already points at the next block
 *                after 16 loads and NEXT_BLOCK is empty.
 */
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data			*/
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block		*/
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data			*/ \
	addi		rWP,rWP,4;	/* increment per word		*/
#define NEXT_BLOCK			/* nothing to do		*/
#endif
|
|
|
/*
 * R_00_15: two interleaved SHA-1 rounds that take their W words straight
 * from the input (F = (B and C) or (~B and D)).
 *
 *   a..e	working variables as seen by the first round ("1:" comments);
 *		the second round ("2:") uses them rotated by one position,
 *		i.e. e is its A and d is its E
 *   w0, w1	receive the two loaded message words; on exit w1 holds both
 *		(evmergelo), w0 is only a 32 bit scratch value
 *   k		selects LOAD_K##k##1 (0 = keep current rK)
 *   off	byte offset of the first word, passed on to LOAD_DATA
 *
 * Clobbers rT0, rT1, rT2. The last instruction carries no ';' - each
 * invocation must sit on a line of its own.
 */
#define R_00_15(a, b, c, d, e, w0, w1, k, off) \
	LOAD_DATA(w0, off)		/* 1: W				*/ \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	LOAD_K##k##1							   \
	andc		rT1,d,b;	/* 1: F" = ~B and D		*/ \
	rotrwi		rT0,a,27;	/* 1: A' = A rotl 5		*/ \
	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
	add		e,e,rT0;	/* 1: E = E + A'		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	add		e,e,w0;		/* 1: E = E + W			*/ \
	LOAD_DATA(w1, off+4)		/* 2: W				*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	and		rT1,a,b;	/* 2: F' = B and C		*/ \
	add		e,e,rK;		/* 1: E = E + K			*/ \
	andc		rT2,c,a;	/* 2: F" = ~B and D		*/ \
	add		d,d,rK;		/* 2: E = E + K			*/ \
	or		rT2,rT2,rT1;	/* 2: F = F' or F"		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,w1;		/* 2: E = E + W			*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT0;	/* 2: E = E + A'		*/ \
	evmergelo	w1,w1,w0;	/*    mix W[0]/W[1]		*/ \
	add		d,d,rT2		/* 2: E = E + F			*/
|
|
|
/*
 * R_16_19: two interleaved SHA-1 rounds with F = (B and C) or (~B and D)
 * whose W words are produced by the message schedule.
 *
 *   a..e		working variables as seen by the first round ("1:"); the
 *			second round ("2:") uses them rotated by one position
 *   w0			in: W[-16] pair, out: the two new W words
 *   w1, w4		W[-14] and W[-8] pairs
 *   w6, w7		pairs from which W[-3] is assembled (evmergelohi)
 *   k			selects LOAD_K##k##1; the reload happens after W + K was
 *			formed, so it takes effect for the following invocation
 *
 * The ev* instructions compute the schedule for both words at once; the
 * scalar instructions in between perform the rounds. Clobbers rT0-rT2.
 */
#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
	andc		rT1,d,b;	/* 1: F" = ~B and D		*/ \
	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
	or		rT1,rT1,rT2;	/* 1: F = F' or F"		*/ \
	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
	add		e,e,rT1;	/* 1: E = E + F			*/ \
	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	LOAD_K##k##1							   \
	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	and		rT2,a,b;	/* 2: F' = B and C		*/ \
	andc		rT1,c,a;	/* 2: F" = ~B and D		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	or		rT1,rT1,rT2;	/* 2: F = F' or F"		*/ \
	add		d,d,rT0;	/* 2: E = E + A'		*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT1		/* 2: E = E + F			*/
|
|
|
/*
 * R_20_39: two interleaved SHA-1 rounds with F = B xor C xor D and
 * schedule-generated W words. Also used for rounds 60-79 via R_60_79.
 *
 *   a..e		working variables as seen by the first round ("1:"); the
 *			second round ("2:") uses them rotated by one position
 *   w0			in: W[-16] pair, out: the two new W words
 *   w1, w4		W[-14] and W[-8] pairs
 *   w6, w7		pairs from which W[-3] is assembled (evmergelohi)
 *   k			selects LOAD_K##k##1; the reload happens after W + K was
 *			formed, so it takes effect for the following invocation
 *
 * Clobbers rT0, rT1, rT2.
 */
#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
	xor		rT2,b,c;	/* 1: F' = B xor C		*/ \
	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
	xor		rT2,rT2,d;	/* 1: F = F' xor D		*/ \
	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	LOAD_K##k##1							   \
	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	xor		rT2,a,b;	/* 2: F' = B xor C		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	xor		rT2,rT2,c;	/* 2: F = F' xor D		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,rT2;	/* 2: E = E + F			*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	add		d,d,rT0		/* 2: E = E + A'		*/
|
|
|
/*
 * R_40_59: two interleaved SHA-1 rounds with
 * F = (B and C) or ((B or C) and D) and schedule-generated W words.
 *
 *   a..e		working variables as seen by the first round ("1:"); the
 *			second round ("2:") uses them rotated by one position
 *   w0			in: W[-16] pair, out: the two new W words
 *   w1, w4		W[-14] and W[-8] pairs
 *   w6, w7		pairs from which W[-3] is assembled (evmergelohi)
 *   k			selects LOAD_K##k##1; the reload happens after W + K was
 *			formed, so it takes effect for the following invocation
 *
 * Clobbers rT0, rT1, rT2. Note that the second round reuses rT0 as a
 * scratch register once WK has been consumed.
 */
#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	and		rT2,b,c;	/* 1: F' = B and C		*/ \
	evmergelohi	rT0,w7,w6;	/*    W[-3]			*/ \
	or		rT1,b,c;	/* 1: F" = B or C		*/ \
	evxor		w0,w0,rT0;	/*    W = W[-16] xor W[-3]	*/ \
	and		rT1,d,rT1;	/* 1: F" = F" and D		*/ \
	evxor		w0,w0,w4;	/*    W = W xor W[-8]		*/ \
	or		rT2,rT2,rT1;	/* 1: F = F' or F"		*/ \
	evxor		w0,w0,w1;	/*    W = W xor W[-14]		*/ \
	add		e,e,rT2;	/* 1: E = E + F			*/ \
	evrlwi		w0,w0,1;	/*    W = W rotl 1		*/ \
	rotrwi		rT2,a,27;	/* 1: A' = A rotl 5		*/ \
	evaddw		rT0,w0,rK;	/*    WK = W + K		*/ \
	add		e,e,rT2;	/* 1: E = E + A'		*/ \
	LOAD_K##k##1							   \
	evmergehi	rT1,rT1,rT0;	/*    WK1/WK2			*/ \
	rotrwi		b,b,2;		/* 1: B = B rotl 30		*/ \
	add		e,e,rT0;	/* 1: E = E + WK		*/ \
	and		rT2,a,b;	/* 2: F' = B and C		*/ \
	or		rT0,a,b;	/* 2: F" = B or C		*/ \
	add		d,d,rT1;	/* 2: E = E + WK		*/ \
	and		rT0,c,rT0;	/* 2: F" = F" and D		*/ \
	rotrwi		a,a,2;		/* 2: B = B rotl 30		*/ \
	or		rT2,rT2,rT0;	/* 2: F = F' or F"		*/ \
	rotrwi		rT0,e,27;	/* 2: A' = A rotl 5		*/ \
	add		d,d,rT2;	/* 2: E = E + F			*/ \
	add		d,d,rT0		/* 2: E = E + A'		*/
|
|
|
/*
 * R_60_79: rounds 60-79 use the same instruction sequence as rounds 20-39;
 * only the constant held in rK differs, so this simply forwards to R_20_39.
 */
#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \
	R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k)
|
|
|
/*
 * ppc_spe_sha1_transform - run the SHA-1 compression function over one or
 * more consecutive 64 byte blocks.
 *
 * In:	r3 (rHP) = pointer to the five 32 bit hash words, updated in place
 *		   after every block
 *	r4 (rWP) = pointer to the input data
 *	r5	 = loop count (number of blocks); copied to CTR, after which
 *		   r5 is reused as rKP
 *
 * Uses r0, r6-r12 and CTR as scratch; r14-r23 are saved and restored by
 * INITIALIZE / FINALIZE.
 *
 * Each macro invocation below performs two rounds. The five working
 * registers are passed in rotated order instead of being moved, and the
 * eight 64 bit rW registers form a sliding window over the message
 * schedule. A non-zero last macro argument reloads rK for the next group
 * of rounds.
 *
 * NOTE(review): the loop is closed by bdnz, so a count of 0 is not treated
 * as "nothing to do" - presumably callers never pass 0; verify.
 */
_GLOBAL(ppc_spe_sha1_transform)
	INITIALIZE

	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	mtctr		r5			/* block count, before r5 becomes rKP */
	lwz		rH2,8(rHP)
	lis		rKP,PPC_SPE_SHA1_K@h
	lwz		rH3,12(rHP)
	ori		rKP,rKP,PPC_SPE_SHA1_K@l
	lwz		rH4,16(rHP)

ppc_spe_sha1_main:
	/* rounds 0-15: W comes directly from the input block */
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16)
	R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24)
	R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32)
	R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40)
	R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48)
	R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56)

	/* rounds 16-19 */
	R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0)
	R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2)

	/* rounds 20-39 */
	R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0)
	R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0)
	R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0)
	R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0)
	R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0)
	R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3)

	/* rounds 40-59 */
	R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0)
	R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0)
	R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0)
	R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0)
	R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0)
	R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4)

	/*
	 * rounds 60-79; the previous hash words are fetched in between, each
	 * into a register that the remaining rounds no longer read
	 */
	R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0)
	R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0)
	R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0)
	lwz		rT3,0(rHP)
	R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0)
	lwz		rW1,4(rHP)
	R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0)
	lwz		rW2,8(rHP)
	R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0)
	lwz		rW3,12(rHP)
	NEXT_BLOCK
	lwz		rW4,16(rHP)

	/* add the block result to the previous hash and write it back */
	add		rH0,rH0,rT3
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)

	bdnz		ppc_spe_sha1_main	/* next block while --CTR != 0 */

	FINALIZE
	blr
|
|
|
/*
 * SHA-1 round constants, one 32 bit word per group of 20 rounds. The
 * LOAD_K* macros read them at offsets 0, 4, 8 and 12 from rKP.
 *
 * NOTE(review): nothing in this file writes to the table, so it could
 * presumably live in a read-only section - confirm before moving it.
 */
	.data
	.align	4
PPC_SPE_SHA1_K:
	.long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6
|
|