/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
 */
/* Provenance: this copy comes from a repository forked from Qortal/Brooklyn. */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

/* L(foo) expands to .Lfoo, i.e. a label name with the .L prefix. */
#define L(label) .L ## label

/* Arguments (x0-x2) and the pointers derived from them. */
#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5

/*
 * Data registers, used in low/high pairs with ldp/stp.
 * The _lw names are the 32-bit views of the matching _l registers.
 */
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17

/*
 * G and H are not additional registers: they reuse count, dst, src and
 * srcend.  Loading into G or H therefore destroys those values.
 */
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend

/* Scratch register.  It is the same physical register (x14) as E_l. */
#define tmp1	x14

/*
 * This implementation handles overlaps and supports both memcpy and memmove
 * from a single entry point.  It uses unaligned accesses and branchless
 * sequences to keep the code small, simple and improve performance.
 *
 * Copies are split into 3 main cases: small copies of up to 32 bytes, medium
 * copies of up to 128 bytes, and large copies.  The overhead of the overlap
 * check is negligible since it is only required for large copies: in the
 * small and medium paths every load is issued before the first store.
 *
 * Large copies use a software pipelined loop processing 64 bytes per iteration.
 * The destination pointer is 16-byte aligned to minimize unaligned accesses.
 * The loop tail is handled by always copying 64 bytes from the end.
 *
 * In:       x0 = dstin (destination), x1 = src (source), x2 = count (bytes)
 * Out:      x0 is never written, so it still holds dstin on return
 * Clobbers: x1-x17 (which ones depends on the path taken) and the
 *           condition flags; sp is not referenced
 */

SYM_FUNC_START(__pi_memcpy)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/*
	 * Small copies: 0..32 bytes.
	 * For 16..32 bytes, copy 16 bytes from the start and 16 bytes from
	 * the end; the two ranges overlap whenever count is below 32.
	 */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  count < 16 here, so bit 3 clear means count < 8. */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  count < 8 here, so bit 2 clear means count < 4. */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/*
	 * Copy 0..3 bytes using a branchless sequence: the first byte, the
	 * byte at offset count / 2 and the last byte.  For count = 1 or 2
	 * some of these refer to the same byte.
	 */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  32 bytes from each end are loaded first. */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes. */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	/*
	 * 97..128 bytes: G and H alias count, dst, src and srcend, so these
	 * loads overwrite them.  Everything after this point addresses memory
	 * through dstin and dstend only.
	 */
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes. */
L(copy_long):
	/*
	 * Use backwards copy if there is an overlap.
	 * Nothing to do when dstin == src.  Otherwise dstin - src, compared
	 * as an unsigned value, is below count when the destination starts
	 * inside the source buffer.
	 */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment. */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large. */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!	/* Pre-index: src advances by 64. */
	subs	count, count, 128 + 16	/* Test and readjust count. */
	b.ls	L(copy64_from_end)

	/* Each iteration stores the 64 bytes already loaded and loads the next 64. */
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/*
	 * Large backwards copy for overlapping copies.
	 * Copy 16 bytes and then align dst to 16-byte alignment.
	 * This mirrors the forward path, working down from srcend/dstend.
	 */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!	/* Pre-index: srcend drops by 64. */
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/*
	 * Write the last iteration and copy 64 bytes from the start.
	 * The load into G overwrites count and dst (G aliases them); neither
	 * is read again below.
	 */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret
SYM_FUNC_END(__pi_memcpy)

/*
 * All of the names below resolve to the single __pi_memcpy entry point.
 * NOTE(review): memcpy and memmove are declared through the _WEAK variant of
 * the alias macro, presumably so another definition can take precedence;
 * confirm against the macro definitions in linux/linkage.h.
 */
SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

/* memmove shares the implementation: __pi_memcpy performs its own overlap check. */
SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)

SYM_FUNC_ALIAS(__memmove, __pi_memmove)
EXPORT_SYMBOL(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
EXPORT_SYMBOL(memmove)