forked from Qortal/Brooklyn
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
316 lines
7.8 KiB
316 lines
7.8 KiB
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */
|
|
|
#include <linux/linkage.h> |
|
#include <asm/asm.h> |
|
|
|
SYM_FUNC_START(__memmove)
SYM_FUNC_START_WEAK(memmove)
	/*
	 * void *memmove(void *dest, const void *src, size_t n)
	 *
	 * Copies n bytes from src to dest.  The two regions may overlap:
	 * the copy runs forward when src is above dest and in reverse
	 * when src is below dest, so source bytes are always read before
	 * they can be overwritten.
	 *
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Clobbers
	 *   a1, a2, a4-a7, t0-t6.  No stack is used and ra is untouched.
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Strategy
	 *   n <  2 * SZREG          : plain byte copy.
	 *   dest/src co-aligned     : byte copy up to the first aligned dest
	 *                             word, SZREG-wide copy, byte copy the tail.
	 *   dest/src not co-aligned : same shape, but every dest word is
	 *                             assembled from two aligned src words
	 *                             with a shift/or pair, so that all
	 *                             multibyte accesses stay naturally aligned.
	 *
	 * All branch targets inside this function use the ".L" prefix so
	 * that the assembler keeps them out of the object's symbol table.
	 * Ordinary labels here would appear as extra symbols in the middle
	 * of __memmove and confuse anything that derives a function's size
	 * from the position of the next symbol.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */

	/* Return if nothing to do */
	beq a0, a1, .Lreturn_from_memmove
	beqz a2, .Lreturn_from_memmove

	/*
	 * Register Uses
	 *      Forward Copy: a1 - Index counter of src
	 *      Reverse Copy: a4 - Index counter of src
	 *      Forward Copy: t3 - Index counter of dest
	 *      Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
	 *   Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse Shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv   t3, a0		/* t3 = first byte of dest */
	add  t4, a0, a2		/* t4 = one past the last byte of dest */
	add  a4, a1, a2		/* a4 = one past the last byte of src */

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 *
	 * Masking n with -(2 * SZREG) leaves zero exactly when
	 * n < (2 * SZREG).
	 */
	andi t0, a2, -(2 * SZREG)
	beqz t0, .Lbyte_copy

	/*
	 * Now solve for t5 and t6.
	 */
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest(Register t3) rounded down to the nearest naturally
	 * aligned SZREG address, does not equal dest, then add SZREG
	 * to find the low-bound of SZREG alignment in the dest memory
	 * region.  Note that this could overshoot the dest memory
	 * region if n is less than SZREG.  That cannot happen here
	 * because anything shorter than (2 * SZREG) has already been
	 * sent to the byte copy.  That same threshold also guarantees
	 * t5 < t6, i.e. at least one whole aligned word lies inside
	 * dest, which the bulk loops below rely on because they test
	 * their exit condition only after the first store.
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
	beq  t5, t3, 1f
	addi t5, t5, SZREG
1:

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a full misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor  t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, .Lcoaligned_copy
	/* Fall through to misaligned fixup copy */

.Lmisaligned_fixup_copy:
	/* src below dest: copy from the end so overlap is handled */
	bltu a1, a0, .Lmisaligned_fixup_copy_reverse

.Lmisaligned_fixup_copy_forward:
	/* Advance t3/a1 byte-wise until dest (t3) reaches t5; t0 = link */
	jal  t0, .Lbyte_copy_until_aligned_forward

	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 * 	load_val1 = load_ptr[1];
	 * 	store_ptr += 2;
	 * 	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val0 = load_ptr[2];
	 * 	load_ptr += 2;
	 * 	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 *
	 * The loop is unrolled twice.  When the number of aligned dest
	 * words is odd, the mid-loop check against a2 (t6 + SZREG, since
	 * t3 has already been advanced by two words) leaves the loop
	 * after the first store of the final pass.
	 */

	REG_L t0, (0 * SZREG)(a1)
1:
	REG_L t1, (1 * SZREG)(a1)
	addi  t3, t3, (2 * SZREG)
	srl   t0, t0, a6
	sll   t2, t1, a7
	or    t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

	beq   t3, a2, 2f

	REG_L t0, (2 * SZREG)(a1)
	addi  a1, a1, (2 * SZREG)
	srl   t1, t1, a6
	sll   t2, t0, a7
	or    t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

	bne   t3, t6, 1b
2:
	mv    t3, t6 /* Fix the dest pointer in case the loop was broken */

	add  a1, t3, a5 /* Restore the src pointer */
	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lmisaligned_fixup_copy_reverse:
	/* Step t4/a4 back byte-wise until dest (t4) reaches t6; t0 = link */
	jal  t0, .Lbyte_copy_until_aligned_reverse

	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 * 	load_val0 = load_ptr[-1];
	 * 	store_ptr -= 2;
	 * 	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val1 = load_ptr[-2];
	 * 	load_ptr -= 2;
	 * 	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 *
	 * Mirror image of the forward loop: a2 (t5 - SZREG) catches the
	 * case where an odd number of aligned dest words is copied.
	 */

	REG_L t1, ( 0 * SZREG)(a4)
1:
	REG_L t0, (-1 * SZREG)(a4)
	addi  t4, t4, (-2 * SZREG)
	sll   t1, t1, a7
	srl   t2, t0, a6
	or    t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)

	beq   t4, a2, 2f

	REG_L t1, (-2 * SZREG)(a4)
	addi  a4, a4, (-2 * SZREG)
	sll   t0, t0, a7
	srl   t2, t1, a6
	or    t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)

	bne   t4, t5, 1b
2:
	mv    t4, t5 /* Fix the dest pointer in case the loop was broken */

	add  a4, t4, a5 /* Restore the src pointer */
	j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at their terminations.
 */
.Lcoaligned_copy:
	/* src below dest: copy from the end so overlap is handled */
	bltu a1, a0, .Lcoaligned_copy_reverse

.Lcoaligned_copy_forward:
	jal t0, .Lbyte_copy_until_aligned_forward

1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi  a1, a1, SZREG
	addi  t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne   t3, t6, 1b

	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lcoaligned_copy_reverse:
	jal t0, .Lbyte_copy_until_aligned_reverse

1:
	REG_L t1, (-1 * SZREG)(a4)
	addi  a4, a4, -SZREG
	addi  t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne   t4, t5, 1b

	j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * These are basically sub-functions within the function.  They
 * are used to byte copy until the dest pointer is in alignment.
 * At which point, a bulk copy method can be used by the
 * calling code.  These work on the same registers as the bulk
 * copy loops.  Therefore, the register values can be picked
 * up from where they were left and we avoid code duplication
 * without any overhead except the call in and return jumps.
 *
 * The return address is passed in t0 rather than ra, so ra still
 * holds memmove's own return address.  Only t1 is used as a
 * temporary here, which keeps the link in t0 intact.
 */
.Lbyte_copy_until_aligned_forward:
	beq  t3, t5, 2f		/* dest already aligned: nothing to do */
1:
	lb   t1,  0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t5, 1b
2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

.Lbyte_copy_until_aligned_reverse:
	beq  t4, t6, 2f		/* dest end already aligned: nothing to do */
1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1,  0(t4)
	bne  t4, t6, 1b
2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

/*
 * Simple byte copy loops.
 * These will byte copy until they reach the end of data to copy.
 * At that point, they will call to return from memmove.
 *
 * The forward loop runs t3 up to t4; the reverse loop runs t4 down
 * to t3.  When entered after a bulk copy, whichever of t3/t4 was not
 * used as the bulk index still holds the original region bound, so
 * only the leftover tail (or head) bytes are copied.
 */
.Lbyte_copy:
	bltu a1, a0, .Lbyte_copy_reverse

.Lbyte_copy_forward:
	beq  t3, t4, 2f
1:
	lb   t1,  0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t4, 1b
2:
	ret

.Lbyte_copy_reverse:
	beq  t4, t3, 2f
1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1,  0(t4)
	bne  t4, t3, 1b
2:
	/* Fall through to the common return */

.Lreturn_from_memmove:
	ret

SYM_FUNC_END(memmove)
SYM_FUNC_END(__memmove)
|
|
|