/* Forked from Qortal/Brooklyn. Repository viewer metadata: ArmAsm, 286 lines, 8.5 KiB. */
/*
Copyright (c) 2013, Raspberry Pi Foundation
Copyright (c) 2013, RISC OS Open Ltd
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the copyright holder nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <linux/linkage.h>
#include "arm-mem.h"

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

/*
 * Assembler setup for the rest of the file: emit into .text, assemble
 * ARM-state (not Thumb) ARMv6 instructions, enable GAS alternate macro
 * mode for the macros below, and align to a 4-byte boundary.
 * .object_arch overrides the architecture recorded in the object file.
 */
    .text
    .arch armv6
    .object_arch armv4
    .arm
    .altmacro
    .p2align 2

/*
 * memcmp_process_head unaligned
 *
 * Load the next 16 bytes of each buffer: four words from S_1 into
 * DAT0-DAT3 and four words from S_2 into DAT4-DAT7. Both pointers are
 * post-incremented by 16. No flags are changed.
 *
 * unaligned: when non-zero, S_1 is read with four single-word LDRs
 *            instead of one LDM (the callers in this file pass 1 when
 *            S_1 is not word-aligned). S_2 is always read with LDM.
 */
.macro memcmp_process_head  unaligned
 .if unaligned
        ldr     DAT0, [S_1], #4
        ldr     DAT1, [S_1], #4
        ldr     DAT2, [S_1], #4
        ldr     DAT3, [S_1], #4
 .else
        ldmia   S_1!, {DAT0, DAT1, DAT2, DAT3}
 .endif
        ldmia   S_2!, {DAT4, DAT5, DAT6, DAT7}
.endm

/*
 * memcmp_process_tail
 *
 * Compare the 16 bytes loaded by memcmp_process_head: DAT0-DAT3 against
 * DAT4-DAT7, lowest-addressed word first. Each CMPEQ only executes while
 * all earlier pairs were equal, so on a mismatch the flags are those of
 * the first differing pair when control reaches local label 200.
 * Falls through with Z set when all four pairs match.
 */
.macro memcmp_process_tail
        cmp     DAT0, DAT4
        cmpeq   DAT1, DAT5
        cmpeq   DAT2, DAT6
        cmpeq   DAT3, DAT7
        bne     200f
.endm

/*
 * memcmp_leading_31bytes
 *
 * Compare a leading run of 1 to 31 bytes whose length is given by bits
 * 0-4 of OFF (the caller in this file sets OFF = -S_2, i.e. the distance
 * to the next 32-byte boundary of S_2). The run is split into
 * 1 + 2 + 4 + 8 + 16 byte pieces, one per bit of OFF; each piece is
 * loaded and compared only if its bit is set. S_1 and S_2 advance past
 * the bytes consumed and N is reduced by the same amount.
 *
 * Registers for pieces that are not loaded are zeroed on both sides so
 * that they compare equal. Branches to local label 200 on a mismatch.
 * Clobbers DAT0-DAT7 and the flags.
 */
.macro memcmp_leading_31bytes
        /* 1- and 2-byte pieces: OFF bit 0 -> N flag (mi), bit 1 -> C flag (cs) */
        movs    DAT0, OFF, lsl #31
        ldrmib  DAT0, [S_1], #1
        ldrcsh  DAT1, [S_1], #2
        ldrmib  DAT4, [S_2], #1
        ldrcsh  DAT5, [S_2], #2
        movpl   DAT0, #0
        movcc   DAT1, #0
        movpl   DAT4, #0
        movcc   DAT5, #0
        submi   N, N, #1
        subcs   N, N, #2
        cmp     DAT0, DAT4
        cmpeq   DAT1, DAT5
        bne     200f
        /* 4- and 8-byte pieces: OFF bit 2 -> N flag (mi), bit 3 -> C flag (cs) */
        movs    DAT0, OFF, lsl #29
        ldrmi   DAT0, [S_1], #4
        ldrcs   DAT1, [S_1], #4
        ldrcs   DAT2, [S_1], #4
        ldrmi   DAT4, [S_2], #4
        ldmcsia S_2!, {DAT5, DAT6}
        movpl   DAT0, #0
        movcc   DAT1, #0
        movcc   DAT2, #0
        movpl   DAT4, #0
        movcc   DAT5, #0
        movcc   DAT6, #0
        submi   N, N, #4
        subcs   N, N, #8
        cmp     DAT0, DAT4
        cmpeq   DAT1, DAT5
        cmpeq   DAT2, DAT6
        bne     200f
        /* 16-byte piece: OFF bit 4. S_1 is read word by word here. */
        tst     OFF, #16
        beq     105f
        memcmp_process_head  1
        sub     N, N, #16
        memcmp_process_tail
105:
.endm

/*
 * memcmp_trailing_15bytes unaligned
 *
 * Compare the final 1 to 15 bytes, whose count is in bits 0-3 of N.
 * The run is split into 8 + 4 + 2 + 1 byte pieces, one per bit of N;
 * each piece is loaded and compared only if its bit is set, and the
 * registers of skipped pieces are zeroed on both sides so they compare
 * equal. N is destroyed (shifted left) in the process.
 *
 * unaligned: when non-zero, the 8-byte piece of S_1 is read with two
 *            single-word LDRs instead of one LDM.
 *
 * Branches to local label 200 on a mismatch and falls through when all
 * pieces match. Clobbers DAT0-DAT2, DAT4-DAT6, N and the flags.
 */
.macro memcmp_trailing_15bytes  unaligned
        /* 8- and 4-byte pieces: N bit 3 -> C flag (cs), bit 2 -> N flag (mi) */
        movs    N, N, lsl #29
 .if unaligned
        ldrcs   DAT0, [S_1], #4
        ldrcs   DAT1, [S_1], #4
 .else
        ldmcsia S_1!, {DAT0, DAT1}
 .endif
        ldrmi   DAT2, [S_1], #4
        ldmcsia S_2!, {DAT4, DAT5}
        ldrmi   DAT6, [S_2], #4
        movcc   DAT0, #0
        movcc   DAT1, #0
        movpl   DAT2, #0
        movcc   DAT4, #0
        movcc   DAT5, #0
        movpl   DAT6, #0
        cmp     DAT0, DAT4
        cmpeq   DAT1, DAT5
        cmpeq   DAT2, DAT6
        bne     200f
        /* 2- and 1-byte pieces: original N bit 1 -> C flag (cs), bit 0 -> N flag (mi) */
        movs    N, N, lsl #2
        ldrcsh  DAT0, [S_1], #2
        ldrmib  DAT1, [S_1]             /* last access: no pointer writeback */
        ldrcsh  DAT4, [S_2], #2
        ldrmib  DAT5, [S_2]
        movcc   DAT0, #0
        movpl   DAT1, #0
        movcc   DAT4, #0
        movpl   DAT5, #0
        cmp     DAT0, DAT4
        cmpeq   DAT1, DAT5
        bne     200f
.endm

/*
 * memcmp_long_inner_loop unaligned
 *
 * Main body of the long case. It never falls through: it either
 * branches to local label 200 on a mismatch or returns 0 itself.
 *
 *   110: compares 32 bytes per iteration (two 16-byte head/tail pairs),
 *        issuing one PLD ahead on S_2 and one on S_1 (at S_1 + OFF).
 *        N was pre-biased by the caller so that SUBS/BHS ends this loop
 *        while there are still blocks left for the loop at 120.
 *   120: compares the remaining whole 16-byte chunks without preloads.
 *   then any final 1 to 15 bytes via memcmp_trailing_15bytes.
 *
 * unaligned: passed through to memcmp_process_head and
 *            memcmp_trailing_15bytes to select how S_1 is read.
 *
 * preload_trailing is not defined in this file (presumably it comes
 * from arm-mem.h); its exact behaviour is not visible here.
 */
.macro memcmp_long_inner_loop  unaligned
110:
        memcmp_process_head  unaligned
        pld     [S_2, #prefetch_distance*32 + 16]
        memcmp_process_tail
        memcmp_process_head  unaligned
        pld     [S_1, OFF]
        memcmp_process_tail
        subs    N, N, #32
        bhs     110b
        /* Just before the final (prefetch_distance+1) 32-byte blocks,
         * deal with final preloads */
        preload_trailing  0, S_1, N, DAT0
        preload_trailing  0, S_2, N, DAT0
        /* Undo the caller's bias on N, less 16 so that SUBS/BHS below
         * stops once fewer than 16 bytes remain */
        add     N, N, #(prefetch_distance+2)*32 - 16
120:
        memcmp_process_head  unaligned
        memcmp_process_tail
        subs    N, N, #16
        bhs     120b
        /* Trailing words and bytes */
        tst     N, #15
        beq     199f
        memcmp_trailing_15bytes  unaligned
199:    /* Reached end without detecting a difference */
        mov     a1, #0
        setend  le                      /* undo the setend be done on entry to memcmp */
        pop     {DAT1-DAT6, pc}
.endm

/*
 * memcmp_short_inner_loop unaligned
 *
 * Main body of the short case. It never falls through: it either
 * branches to local label 200 on a mismatch or returns 0 itself.
 *
 * Compares whole 16-byte chunks in the loop at 120 (skipped entirely
 * when fewer than 16 bytes remain), then any final 1 to 15 bytes via
 * memcmp_trailing_15bytes.
 *
 * unaligned: passed through to memcmp_process_head and
 *            memcmp_trailing_15bytes to select how S_1 is read.
 */
.macro memcmp_short_inner_loop  unaligned
        subs    N, N, #16     /* simplifies inner loop termination */
        blo     122f
120:
        memcmp_process_head  unaligned
        memcmp_process_tail
        subs    N, N, #16
        bhs     120b
122:    /* Trailing words and bytes */
        tst     N, #15
        beq     199f
        memcmp_trailing_15bytes  unaligned
199:    /* Reached end without detecting a difference */
        mov     a1, #0
        setend  le                      /* undo the setend be done on entry to memcmp */
        pop     {DAT1-DAT6, pc}
.endm

/*
 * int memcmp(const void *s1, const void *s2, size_t n);
 * On entry:
 * a1 = pointer to buffer 1
 * a2 = pointer to buffer 2
 * a3 = number of bytes to compare (as unsigned chars)
 * On exit:
 * a1 = >0/=0/<0 if s1 >/=/< s2
 *      (this implementation returns exactly 1, 0 or -1)
 *
 * Method: data endianness is switched to big-endian for the duration of
 * the call so that an unsigned compare of a loaded halfword or word
 * orders the bytes by address, and several bytes can be compared per
 * instruction. Every exit path switches back with "setend le".
 *
 * Long buffers are compared after first advancing S_2 to a 32-byte
 * boundary, with preloads issued ahead of the reads; short buffers are
 * compared after first advancing S_2 to a 4-byte boundary. In both cases
 * a separate code path is used when S_1 is then not word-aligned.
 *
 * v1-v6 and lr are saved on the stack and restored on return.
 *
 * NOTE(review): the unconditional "setend le" on exit assumes the caller
 * runs with little-endian data accesses - confirm this file is never
 * built into a big-endian configuration.
 *
 * preload_leading_step1, preload_leading_step2 and preload_all are not
 * defined in this file (presumably they come from arm-mem.h).
 */

.set    prefetch_distance, 2

ENTRY(memcmp)
        /* Register roles */
        S_1     .req    a1      /* read pointer into buffer 1; also the return value */
        S_2     .req    a2      /* read pointer into buffer 2 */
        N       .req    a3      /* bytes remaining (biased in the long case) */
        DAT0    .req    a4      /* DAT0-DAT3: data loaded from S_1 */
        DAT1    .req    v1
        DAT2    .req    v2
        DAT3    .req    v3
        DAT4    .req    v4      /* DAT4-DAT7: data loaded from S_2 */
        DAT5    .req    v5
        DAT6    .req    v6
        DAT7    .req    ip
        OFF     .req    lr      /* leading byte count, then S_1 preload offset */

        push    {DAT1-DAT6, lr}
        setend  be /* lowest-addressed bytes are most significant */

        /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
        cmp     N, #(prefetch_distance+3)*32 - 1
        blo     170f

        /* Long case */
        /* Adjust N so that the decrement instruction can also test for
         * inner loop termination. We want it to stop when there are
         * (prefetch_distance+1) complete blocks to go. */
        sub     N, N, #(prefetch_distance+2)*32
        preload_leading_step1  0, DAT0, S_1
        preload_leading_step1  0, DAT1, S_2
        tst     S_2, #31
        beq     154f
        /* NOTE(review): memcmp_leading_31bytes tests bits 0-4 of OFF, so
         * the "15" in the next comment looks like it should read 31 */
        rsb     OFF, S_2, #0 /* no need to AND with 15 here */
        preload_leading_step2  0, DAT0, S_1, OFF, DAT2
        preload_leading_step2  0, DAT1, S_2, OFF, DAT2
        memcmp_leading_31bytes
154:    /* Second source now cacheline (32-byte) aligned; we have at
         * least one prefetch to go. */
        /* Prefetch offset is best selected such that it lies in the
         * first 8 of each 32 bytes - but it's just as easy to aim for
         * the first one */
        and     OFF, S_1, #31
        rsb     OFF, OFF, #32*prefetch_distance
        tst     S_1, #3
        bne     140f
        /* Each expansion below ends by returning, so the word-aligned
         * variant does not fall through into the unaligned one */
        memcmp_long_inner_loop  0
140:    memcmp_long_inner_loop  1

170:    /* Short case */
        teq     N, #0
        beq     199f                    /* n == 0: buffers compare equal */
        preload_all 0, 0, 0, S_1, N, DAT0, DAT1
        preload_all 0, 0, 0, S_2, N, DAT0, DAT1
        tst     S_2, #3
        beq     174f
        /* Compare one byte at a time until S_2 is word-aligned or the
         * bytes run out */
172:    subs    N, N, #1
        blo     199f
        ldrb    DAT0, [S_1], #1
        ldrb    DAT4, [S_2], #1
        cmp     DAT0, DAT4
        bne     200f
        tst     S_2, #3
        bne     172b
174:    /* Second source now 4-byte aligned; we have 0 or more bytes to go */
        tst     S_1, #3
        bne     140f
        /* As in the long case, each expansion ends by returning */
        memcmp_short_inner_loop  0
140:    memcmp_short_inner_loop  1

200:    /* Difference found: determine sign. */
        /* Flags are still those of the unsigned compare that differed */
        movhi   a1, #1
        movlo   a1, #-1
        setend  le
        pop     {DAT1-DAT6, pc}

        .unreq  S_1
        .unreq  S_2
        .unreq  N
        .unreq  DAT0
        .unreq  DAT1
        .unreq  DAT2
        .unreq  DAT3
        .unreq  DAT4
        .unreq  DAT5
        .unreq  DAT6
        .unreq  DAT7
        .unreq  OFF
ENDPROC(memcmp)