mirror of https://github.com/Qortal/Brooklyn
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
140 lines
2.8 KiB
140 lines
2.8 KiB
/* SPDX-License-Identifier: GPL-2.0 */ |
|
/* Copyright 2002 Andi Kleen, SuSE Labs */ |
|
|
|
#include <linux/linkage.h> |
|
#include <asm/cpufeatures.h> |
|
#include <asm/alternative-asm.h> |
|
#include <asm/export.h> |
|
|
|
/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string to get better performance than the original function. The code is
 * simpler and shorter than the original function as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 *
 * Clobbers: rcx, rdx, rsi, r9, flags.
 */
SYM_FUNC_START_WEAK(memset)
SYM_FUNC_START(__memset)
	/*
	 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
	 * to use it when possible. If not available, use fast string instructions.
	 *
	 * Otherwise, use original memset function.
	 *
	 * (Patched at boot: ERMS CPUs jump to memset_erms, REP_GOOD CPUs
	 * fall through to the rep-stosq path below, everything else jumps
	 * to the unrolled memset_orig.)
	 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS

	movq %rdi,%r9			/* save dest; returned in %rax below */
	movq %rdx,%rcx
	andl $7,%edx			/* %edx = count % 8 (tail bytes) */
	shrq $3,%rcx			/* %rcx = count / 8 (qword stores) */
	/* expand byte value */
	movzbl %sil,%esi		/* zero-extend fill byte */
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax			/* replicate byte into all 8 lanes of %rax */
	rep stosq			/* store %rcx qwords at (%rdi) */
	movl %edx,%ecx
	rep stosb			/* store the 0..7 remaining tail bytes */
	movq %r9,%rax			/* return original destination */
	ret
SYM_FUNC_END(__memset)
SYM_FUNC_END_ALIAS(memset)
EXPORT_SYMBOL(memset)
EXPORT_SYMBOL(__memset)
|
|
|
/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 *
 * Only reached via the ALTERNATIVE_2 patch in __memset on CPUs with
 * X86_FEATURE_ERMS (Enhanced REP MOVSB/STOSB), where a single
 * rep stosb handles any count efficiently.
 */
SYM_FUNC_START_LOCAL(memset_erms)
	movq %rdi,%r9			/* save dest for the return value */
	movb %sil,%al			/* %al = fill byte for stosb */
	movq %rdx,%rcx			/* %rcx = byte count */
	rep stosb			/* ERMSB fast byte store */
	movq %r9,%rax			/* return original destination */
	ret
SYM_FUNC_END(memset_erms)
|
|
|
/*
 * Fallback memset for CPUs without fast string support (no REP_GOOD,
 * no ERMS). Unrolled 64-bytes-per-iteration qword store loop with an
 * alignment fix-up for misaligned destinations.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 */
SYM_FUNC_START_LOCAL(memset_orig)
	movq %rdi,%r10			/* save dest; restored into %rax at .Lende */

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq %rcx,%rax			/* %rax = fill byte replicated 8x */

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d			/* %r9d = dest % 8 (misalignment) */
	jnz .Lbad_alignment
.Lafter_bad_alignment:			/* here: %rdi is 8-byte aligned */

	movq %rdx,%rcx
	shrq $6,%rcx			/* %rcx = number of full 64-byte chunks */
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:				/* 8 qword stores = 64 bytes per iteration */
	decq %rcx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi		/* lea advances pointer without touching ZF from decq */
	jnz .Lloop_64

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl %edx,%ecx
	andl $63&(~7),%ecx		/* qword-sized part of the <64-byte tail (56, 48, ...) */
	jz .Lhandle_7
	shrl $3,%ecx			/* convert bytes to qword count */
	.p2align 4
.Lloop_8:				/* store remaining full qwords */
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	andl $7,%edx			/* final 0..7 bytes */
	jz .Lende
	.p2align 4
.Lloop_1:				/* byte-at-a-time for the last few bytes */
	decl %edx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax			/* return original destination */
	ret

.Lbad_alignment:
	cmpq $7,%rdx
	jbe .Lhandle_7			/* total count <= 7: the byte loop covers it */
	movq %rax,(%rdi)		/* unaligned store */
	movq $8,%r8
	subq %r9,%r8			/* %r8 = 8 - misalignment = bytes to skip */
	addq %r8,%rdi			/* advance dest to the next 8-byte boundary */
	subq %r8,%rdx			/* those head bytes are already filled */
	jmp .Lafter_bad_alignment
.Lfinal:
SYM_FUNC_END(memset_orig)
|
|
|