/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

#define VMX_THRESH 4096
#define ENTER_VMX_OPS	\
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	enter_vmx_ops; \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	exit_vmx_ops; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 * ^                                  ^                                 ^
 * 0xbbbb10                           0xbbbb20                          0xbbbb30
 *                             ^
 *                             _vaddr
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

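/*
 * A rough C sketch of what the macro produces (illustrative only; the
 * helper names below are not real kernel functions, and the real code
 * uses one lvx plus one vperm instead of a byte copy):
 *
 *	unsigned char res[16];
 *	const unsigned char *aligned = (const unsigned char *)
 *					((unsigned long)vaddr & ~15UL);
 *	unsigned long k = (unsigned long)vaddr & 15;
 *	for (int i = 0; i < 16; i++)
 *		res[i] = aligned[k + i];   // spans the 1st and 2nd aligned QWs
 *
 * _vmask (from LVS) encodes k, so vperm can pick those 16 bytes out of
 * the two aligned quadwords in a single instruction.
 */
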
/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset relative to an 8-byte boundary. The
 *    handlers are named like .Lsameoffset_xxxx
 * 2) src/dst have different offsets relative to an 8-byte boundary. The
 *    handlers are named like .Ldiffoffset_xxxx
 */
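/*
 * In C terms the split is on whether the two buffers share the same
 * offset within an 8-byte word (sketch only, mirroring the xor/andi.
 * test at the entry point below):
 *
 *	same_offset = ((((unsigned long)src ^ (unsigned long)dst) & 7) == 0);
 *	// same_offset  -> .Lsameoffset_xxxx handlers
 *	// !same_offset -> .Ldiffoffset_xxxx handlers
 */
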
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Use the short loop if the src/dst addresses do not have the
	 * same offset relative to an 8-byte alignment boundary.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to the short loop if the aligned compare is for
	 * fewer than 8 bytes.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* Attempt to compare the leading bytes that are not 8-byte aligned
	 * so that the rest of the comparison can run on an 8-byte alignment.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word, which is not 8-byte aligned:
	 * load the double word at (src & ~7UL) and shift left by the
	 * appropriate number of bits before the comparison.
	 */
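	/*
	 * The same step as a C sketch (illustrative only; load_be64() stands
	 * for the LD macro above, i.e. a load that puts the first memory byte
	 * in the most significant position):
	 *
	 *	k = (unsigned long)s1 & 7;		// both buffers share this offset
	 *	a = load_be64(s1 - k) << (8 * k);	// drop the k bytes before s1
	 *	b = load_be64(s2 - k) << (8 * k);
	 *	if (a != b)
	 *		return (a > b) ? 1 : -1;	// unsigned comparison
	 *	// otherwise both pointers are advanced to the next 8-byte boundary
	 */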
	rlwinm	r6,r3,3,26,28
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* We are now aligned to 8 bytes.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi	r5,r5,61
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
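	/*
	 * The guard and the tail compare as a C sketch (illustrative only;
	 * load_be64() as above, n = remaining length, 0x1000 being the
	 * conservative 4K page size mentioned above):
	 *
	 *	if (((unsigned long)s2 & 0xfff) > 0xff8)	// s2 + 7 would cross a page
	 *		goto byte_by_byte;			// i.e. .Lshort
	 *	a = load_be64(s1) >> (8 * (8 - n));		// keep only the n valid bytes
	 *	b = load_be64(s2) >> (8 * (8 - n));
	 *	return (a == b) ? 0 : ((a > b) ? 1 : -1);
	 */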
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic	r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use the vmx loop if the length is equal to or greater than 4K */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least the s1 addr is aligned to 8 bytes */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Entered with src/dst addrs that have the same offset relative to
	 * an 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to fail early, within the first 32 bytes.
	 * Before applying VMX instructions, which incur the penalty of
	 * saving/restoring the 32 x 128-bit VMX registers, we compare the
	 * first 32 bytes so that we can catch the ~80% of cases that fail
	 * there.
	 */
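	/*
	 * The pre-check as a C sketch (illustrative only): four scalar 8-byte
	 * compares before paying for enter_vmx_ops()/exit_vmx_ops():
	 *
	 *	for (i = 0; i < 4; i++) {
	 *		a = load_be64(s1); b = load_be64(s2);
	 *		if (a != b)
	 *			return (a > b) ? 1 : -1;
	 *		s1 += 8; s2 += 8; n -= 8;
	 *	}
	 *	// only then enable VMX and fall into the 32-bytes-per-iteration loop
	 */
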
	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp

3:
	/* need to check whether r4 has the same offset as r3 relative to
	 * a 16-byte boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* len is no less than 4KB. Need to further align to 16 bytes.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes for each loop */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59
	li	off16,16

	.balign	16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* find the difference within the last 16 bytes */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 to 8 bytes */
	rlwinm	r6,r3,3,26,28
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4	/* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned to 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size is equal to or greater than 4K bytes */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif

	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32-byte pre-check before
	 * enabling VMX operations.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First try to align r3 to 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned to 16 bytes */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5	/* loop for 32 bytes each */
	clrldi	r5,r5,59
	mtctr	r6

	.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* either way, the difference lies within the next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)