/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow efficiently,
	but for some it doesn't, including store queue overflows.  An
	overflow causes a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles but I was not able to get it to run that fast -- the initial
	conditions were such that I could not reach this optimum rate on
	(chaotic) EV6.  I wrote the code such that everything would issue
	in order.

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on randomly aligned pages, there
	is a 25% chance that the pages will be at the same dcache indices.
	Without care, this results in many nasty memory traps.

	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take the
	most advantage of EV6's bandwidth.

	I also prefetched the read stream.  Note that adding the read prefetch
	forced me to add another cycle to the inner-most kernel - up to 11
	from the original 8 cycles per iteration.  We could improve performance
	further by unrolling the loop and doing multiple prefetches per cycle.

   I think that the code below will be very robust and fast code for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case.  */

#include <asm/export.h>

/*
 * copy_page
 *
 * Copies one page as 128 blocks of 64 bytes (eight quadwords each):
 * 118 iterations of the prefetching main loop plus 10 iterations of the
 * cleanup loop, i.e. 128 * 64 = 8192 bytes in total.
 *
 * In:
 *	$16	destination address (base of every stq; first wh64 target)
 *	$17	source address (base of every ldq and every prefetch load)
 *
 * Registers written:
 *	$0-$7	the eight quadwords in flight
 *		($1 and $2 double as wh64 address temporaries in the preamble)
 *	$18	loop iteration counter
 *	$19	write-hint pointer, kept 10 blocks ahead of $16
 *	$16/$17	advanced by 64 per iteration; both end one page past
 *		their entry values
 *
 * Loads into $31 discard their result and are used purely as read
 * prefetches; wh64 is the matching hint for the write stream.
 *
 * The instructions are laid out in groups of four, and the unop/nop
 * fillers are deliberate: they pace instruction fetch ("aeration").
 * Do not reorder instructions or drop fillers without re-measuring.
 */
	.text
	.align 4			/* 2^4 = 16 bytes: one group of four instructions */
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0

	/* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118			/* main-loop trip count */
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)		/* next block to write-hint: 10 ahead of $16 */
	nop

	/* Main prefetching/write-hinting loop: one 64-byte block per pass,
	   eleven groups of four instructions.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)		/* read prefetch, 5 blocks (320 = 5*64) ahead */
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	wh64	($19)
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.  $17 now
	   points at block 118, so offsets 320..576 touch blocks 123..127.  */
	lda	$18,10			/* cleanup-loop trip count */
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	/* Return; the three fillers complete the final group of four.  */
	ret
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)