QortalOS Brooklyn for Raspberry Pi 4
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

285 lines
8.5 KiB

/*
Copyright (c) 2013, Raspberry Pi Foundation
Copyright (c) 2013, RISC OS Open Ltd
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/linkage.h>
#include "arm-mem.h"
/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
.text
.arch armv6
.object_arch armv4
.arm
.altmacro
.p2align 2
/*
 * memcmp_process_head unaligned
 *
 * Fetch the next 16 bytes from each buffer: four words from S_1 into
 * DAT0-DAT3 and four words from S_2 into DAT4-DAT7. Both pointers are
 * advanced by 16. No comparison is done here; see memcmp_process_tail.
 *
 * unaligned: when non-zero, S_1 is read with four single-word LDRs
 *            instead of one LDM (the callers in this file pass 1 when
 *            S_1 is not known to be word-aligned). S_2 is always read
 *            with LDM.
 */
.macro memcmp_process_head unaligned
.if unaligned
/* Four separate word loads, each post-incrementing S_1 by 4 */
ldr DAT0, [S_1], #4
ldr DAT1, [S_1], #4
ldr DAT2, [S_1], #4
ldr DAT3, [S_1], #4
.else
/* One load-multiple with writeback */
ldmia S_1!, {DAT0, DAT1, DAT2, DAT3}
.endif
/* Second buffer: always a load-multiple with writeback */
ldmia S_2!, {DAT4, DAT5, DAT6, DAT7}
.endm
/*
 * memcmp_process_tail
 *
 * Compare the 16 bytes previously loaded by memcmp_process_head:
 * DAT0-DAT3 (from S_1) against DAT4-DAT7 (from S_2), lowest address
 * first. Each later compare only executes while all earlier ones were
 * equal, so on exit the flags describe the first differing word pair.
 * On a difference, control leaves through the next forward label 200
 * with those flags intact; otherwise execution falls through with Z set.
 */
.macro memcmp_process_tail
cmp DAT0, DAT4
/* cmpeq: skipped (flags preserved) once a mismatch has been seen */
cmpeq DAT1, DAT5
cmpeq DAT2, DAT6
cmpeq DAT3, DAT7
bne 200f
.endm
/*
 * memcmp_leading_31bytes
 *
 * Compare and consume between 0 and 31 leading bytes from both buffers.
 * The byte count is taken from bits 0-4 of OFF; memcmp sets OFF to the
 * negated S_2 address before using this macro, so that count is the
 * distance from S_2 to the next 32-byte boundary.
 *
 * Each bit of OFF selects one chunk size (1, 2, 4, 8, 16 bytes). For the
 * sizes below 16 the bits are moved into the N and C flags with a
 * flag-setting shift, and the loads are executed conditionally. Registers
 * belonging to a chunk that is not loaded are zeroed on both sides so the
 * following compare sees them as equal.
 *
 * S_1 and S_2 are advanced and N is reduced by the number of bytes
 * consumed. DAT0-DAT7 are clobbered. On a difference, control leaves
 * through the next forward label 200.
 */
.macro memcmp_leading_31bytes
/* lsl #31: C flag := OFF bit 1 (halfword), N flag := OFF bit 0 (byte).
 * DAT0 is only a scratch destination for the shift here. */
movs DAT0, OFF, lsl #31
ldrmib DAT0, [S_1], #1
ldrcsh DAT1, [S_1], #2
ldrmib DAT4, [S_2], #1
ldrcsh DAT5, [S_2], #2
/* Zero whichever registers were not loaded above */
movpl DAT0, #0
movcc DAT1, #0
movpl DAT4, #0
movcc DAT5, #0
/* Account for the 1 and/or 2 bytes just consumed */
submi N, N, #1
subcs N, N, #2
cmp DAT0, DAT4
cmpeq DAT1, DAT5
bne 200f
/* lsl #29: C flag := OFF bit 3 (two words), N flag := OFF bit 2 (one word) */
movs DAT0, OFF, lsl #29
ldrmi DAT0, [S_1], #4
ldrcs DAT1, [S_1], #4
ldrcs DAT2, [S_1], #4
ldrmi DAT4, [S_2], #4
ldmcsia S_2!, {DAT5, DAT6}
/* Zero whichever registers were not loaded above */
movpl DAT0, #0
movcc DAT1, #0
movcc DAT2, #0
movpl DAT4, #0
movcc DAT5, #0
movcc DAT6, #0
/* Account for the 4 and/or 8 bytes just consumed */
submi N, N, #4
subcs N, N, #8
cmp DAT0, DAT4
cmpeq DAT1, DAT5
cmpeq DAT2, DAT6
bne 200f
/* OFF bit 4: one full 16-byte chunk, using the single-word-load form for S_1 */
tst OFF, #16
beq 105f
memcmp_process_head 1
sub N, N, #16
memcmp_process_tail
105:
.endm
/*
 * memcmp_trailing_15bytes unaligned
 *
 * Compare the final 1-15 bytes of both buffers. The byte count is taken
 * from bits 0-3 of N, which select chunks of 8, 4, 2 and 1 bytes in that
 * order. Registers belonging to a chunk that is not loaded are zeroed on
 * both sides so the following compare sees them as equal.
 *
 * unaligned: when non-zero, the 8-byte chunk of S_1 is read with two
 *            single-word LDRs instead of one LDM.
 *
 * N is destroyed (it is shifted in place to expose its low bits in the
 * flags). DAT0-DAT2 and DAT4-DAT6 are clobbered. On a difference,
 * control leaves through the next forward label 200; otherwise execution
 * falls through.
 */
.macro memcmp_trailing_15bytes unaligned
/* lsl #29: C flag := N bit 3 (8 bytes), N flag := N bit 2 (4 bytes) */
movs N, N, lsl #29
.if unaligned
ldrcs DAT0, [S_1], #4
ldrcs DAT1, [S_1], #4
.else
ldmcsia S_1!, {DAT0, DAT1}
.endif
ldrmi DAT2, [S_1], #4
ldmcsia S_2!, {DAT4, DAT5}
ldrmi DAT6, [S_2], #4
/* Zero whichever registers were not loaded above */
movcc DAT0, #0
movcc DAT1, #0
movpl DAT2, #0
movcc DAT4, #0
movcc DAT5, #0
movpl DAT6, #0
cmp DAT0, DAT4
cmpeq DAT1, DAT5
cmpeq DAT2, DAT6
bne 200f
/* N already holds the count shifted left by 29; two more places give
 * C flag := original bit 1 (halfword), N flag := original bit 0 (byte) */
movs N, N, lsl #2
ldrcsh DAT0, [S_1], #2
/* Last possible byte: no post-increment, the pointers are not used again */
ldrmib DAT1, [S_1]
ldrcsh DAT4, [S_2], #2
ldrmib DAT5, [S_2]
/* Zero whichever registers were not loaded above */
movcc DAT0, #0
movpl DAT1, #0
movcc DAT4, #0
movpl DAT5, #0
cmp DAT0, DAT4
cmpeq DAT1, DAT5
bne 200f
.endm
/*
 * memcmp_long_inner_loop unaligned
 *
 * Main body of the long case. Expects N to have been biased by memcmp
 * (reduced by (prefetch_distance+2)*32) and OFF to hold the prefetch
 * offset for S_1.
 *
 * Stages:
 *   110: 32 bytes per iteration with one pld per buffer, until the
 *        biased N goes negative.
 *   120: 16 bytes per iteration without prefetching, for the remaining
 *        whole 16-byte chunks.
 *   then up to 15 trailing bytes.
 *
 * unaligned: forwarded to memcmp_process_head / memcmp_trailing_15bytes
 *            to select how S_1 is read.
 *
 * The expansion ends by returning 0 from memcmp (label 199), so code
 * placed after it is never reached by fall-through. A difference leaves
 * through the next forward label 200.
 */
.macro memcmp_long_inner_loop unaligned
110:
memcmp_process_head unaligned
/* Prefetch S_2 a fixed distance ahead of the (already advanced) pointer */
pld [S_2, #prefetch_distance*32 + 16]
memcmp_process_tail
memcmp_process_head unaligned
/* Prefetch S_1 using the offset memcmp computed into OFF */
pld [S_1, OFF]
memcmp_process_tail
/* 32 bytes done; loop while the biased count has not gone negative */
subs N, N, #32
bhs 110b
/* Just before the final (prefetch_distance+1) 32-byte blocks,
 * deal with final preloads */
/* preload_trailing is not defined in this file (presumably provided by
 * arm-mem.h); its exact behaviour is not visible here */
preload_trailing 0, S_1, N, DAT0
preload_trailing 0, S_2, N, DAT0
/* Undo the bias applied by memcmp, less 16, so that N is now
 * (bytes remaining - 16) and the loop below can exit on borrow */
add N, N, #(prefetch_distance+2)*32 - 16
120:
memcmp_process_head unaligned
memcmp_process_tail
subs N, N, #16
bhs 120b
/* Trailing words and bytes */
/* The low 4 bits of N are unaffected by the 16-byte steps above and
 * still hold the number of leftover bytes */
tst N, #15
beq 199f
memcmp_trailing_15bytes unaligned
199: /* Reached end without detecting a difference */
mov a1, #0
/* Restore little-endian data accesses before touching the stack */
setend le
pop {DAT1-DAT6, pc}
.endm
/*
 * memcmp_short_inner_loop unaligned
 *
 * Body of the short case: compare whole 16-byte chunks with no
 * prefetching, then up to 15 trailing bytes. N is the number of bytes
 * still to compare on entry (it may be 0).
 *
 * unaligned: forwarded to memcmp_process_head / memcmp_trailing_15bytes
 *            to select how S_1 is read.
 *
 * The expansion ends by returning 0 from memcmp (label 199), so code
 * placed after it is never reached by fall-through. A difference leaves
 * through the next forward label 200.
 */
.macro memcmp_short_inner_loop unaligned
subs N, N, #16 /* simplifies inner loop termination */
/* Fewer than 16 bytes in total: skip the chunk loop altogether */
blo 122f
120:
memcmp_process_head unaligned
memcmp_process_tail
subs N, N, #16
bhs 120b
122: /* Trailing words and bytes */
/* Subtracting multiples of 16 leaves the low 4 bits of N intact, so
 * they still hold the number of leftover bytes */
tst N, #15
beq 199f
memcmp_trailing_15bytes unaligned
199: /* Reached end without detecting a difference */
mov a1, #0
/* Restore little-endian data accesses before touching the stack */
setend le
pop {DAT1-DAT6, pc}
.endm
/*
 * int memcmp(const void *s1, const void *s2, size_t n);
 * On entry:
 * a1 = pointer to buffer 1
 * a2 = pointer to buffer 2
 * a3 = number of bytes to compare (as unsigned chars)
 * On exit:
 * a1 = >0/=0/<0 if s1 >/=/< s2
 * (the values actually produced by this code are 1, 0 and -1)
 *
 * Register use: v1-v6 and lr are pushed on entry and restored on exit
 * (the return is made by popping the saved lr into pc). a4 and ip are
 * used as scratch without being saved.
 *
 * Data accesses are switched to big-endian for the duration of the
 * comparison so that an unsigned compare of two loaded words orders them
 * the same way as a byte-by-byte compare starting at the lowest address.
 * Every exit path executes "setend le" before popping the saved registers.
 */
/* Prefetch look-ahead in units of 32 bytes; used by the pld offsets and
 * by the long/short threshold below */
.set prefetch_distance, 2
ENTRY(memcmp)
/* Symbolic register names used by the macros in this file */
S_1 .req a1
S_2 .req a2
N .req a3
DAT0 .req a4
DAT1 .req v1
DAT2 .req v2
DAT3 .req v3
DAT4 .req v4
DAT5 .req v5
DAT6 .req v6
DAT7 .req ip
/* lr doubles as a scratch register; the return address lives on the stack */
OFF .req lr
/* Save registers while data accesses are still little-endian */
push {DAT1-DAT6, lr}
setend be /* lowest-addressed bytes are most significant */
/* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
/* The threshold allows for up to 31 leading bytes being consumed first
 * to align S_2, on top of those (prefetch_distance+2) blocks */
cmp N, #(prefetch_distance+3)*32 - 1
blo 170f
/* Long case */
/* Adjust N so that the decrement instruction can also test for
 * inner loop termination. We want it to stop when there are
 * (prefetch_distance+1) complete blocks to go. */
sub N, N, #(prefetch_distance+2)*32
/* The preload_* macros are not defined in this file (presumably they
 * come from arm-mem.h); their exact behaviour is not visible here */
preload_leading_step1 0, DAT0, S_1
preload_leading_step1 0, DAT1, S_2
/* Skip the leading-bytes stage if S_2 is already 32-byte aligned */
tst S_2, #31
beq 154f
/* OFF := -S_2; its low five bits are the distance to the next 32-byte
 * boundary. memcmp_leading_31bytes only examines bits 0-4 of OFF.
 * NOTE(review): the "15" in the comment below presumably means 31, since
 * the alignment target is 32 bytes - confirm. How preload_leading_step2
 * uses OFF cannot be checked from this file. */
rsb OFF, S_2, #0 /* no need to AND with 15 here */
preload_leading_step2 0, DAT0, S_1, OFF, DAT2
preload_leading_step2 0, DAT1, S_2, OFF, DAT2
memcmp_leading_31bytes
154: /* Second source now cacheline (32-byte) aligned; we have at
 * least one prefetch to go. */
/* Prefetch offset is best selected such that it lies in the
 * first 8 of each 32 bytes - but it's just as easy to aim for
 * the first one */
/* OFF := 32*prefetch_distance - (S_1 & 31), so S_1 + OFF is S_1 rounded
 * down to a 32-byte boundary plus 32*prefetch_distance */
and OFF, S_1, #31
rsb OFF, OFF, #32*prefetch_distance
/* Choose the inner loop variant according to S_1 word alignment */
tst S_1, #3
bne 140f
/* Each inner-loop expansion ends by returning, so there is no
 * fall-through from one expansion into the next */
memcmp_long_inner_loop 0
140: memcmp_long_inner_loop 1
170: /* Short case */
/* n == 0: nothing to compare, return 0 through the next label 199 */
teq N, #0
beq 199f
preload_all 0, 0, 0, S_1, N, DAT0, DAT1
preload_all 0, 0, 0, S_2, N, DAT0, DAT1
/* Compare single bytes until S_2 is word-aligned or the data runs out */
tst S_2, #3
beq 174f
172: subs N, N, #1
/* Borrow means the count was already 0: all bytes compared equal */
blo 199f
ldrb DAT0, [S_1], #1
ldrb DAT4, [S_2], #1
cmp DAT0, DAT4
bne 200f
tst S_2, #3
bne 172b
174: /* Second source now 4-byte aligned; we have 0 or more bytes to go */
/* Choose the inner loop variant according to S_1 word alignment */
tst S_1, #3
bne 140f
memcmp_short_inner_loop 0
140: memcmp_short_inner_loop 1
200: /* Difference found: determine sign. */
/* Flags come from the unsigned compare that detected the mismatch:
 * HI means the S_1 data was greater, LO means it was smaller */
movhi a1, #1
movlo a1, #-1
/* Restore little-endian data accesses before touching the stack */
setend le
pop {DAT1-DAT6, pc}
/* Release the symbolic register names */
.unreq S_1
.unreq S_2
.unreq N
.unreq DAT0
.unreq DAT1
.unreq DAT2
.unreq DAT3
.unreq DAT4
.unreq DAT5
.unreq DAT6
.unreq DAT7
.unreq OFF
ENDPROC(memcmp)