@@ -0,0 +1,185 @@
+! Copyright (C) 2008-2012 Imagination Technologies Ltd.
+
+	.text
+	.global	_memcpy
+	.type	_memcpy,function
+! D1Ar1 dst
+! D0Ar2 src
+! D1Ar3 cnt
+! D0Re0 dst
+_memcpy:
+	CMP	D1Ar3, #16
+	MOV	A1.2, D0Ar2		! source pointer
+	MOV	A0.2, D1Ar1		! destination pointer
+	MOV	A0.3, D1Ar1		! for return value
+! If there are fewer than 16 bytes to copy, use the byte copy loop
+	BGE	$Llong_copy
+
+$Lbyte_copy:
+! Simply copy a byte at a time
+	SUBS	TXRPT, D1Ar3, #1
+	BLT	$Lend
+$Lloop_byte:
+	GETB	D1Re0, [A1.2++]
+	SETB	[A0.2++], D1Re0
+	BR	$Lloop_byte
+
+$Lend:
+! Finally set return value and return
+	MOV	D0Re0, A0.3
+	MOV	PC, D1RtP
+
+$Llong_copy:
+	ANDS	D1Ar5, D1Ar1, #7	! test destination alignment
+	BZ	$Laligned_dst
+
+! The destination address is not 8 byte aligned. We will copy bytes from
+! the source to the destination until the remaining data has an 8 byte
+! destination address alignment (i.e. we should never copy more than 7
+! bytes here).
+$Lalign_dst:
+	GETB	D0Re0, [A1.2++]
+	ADD	D1Ar5, D1Ar5, #1	! dest is aligned when D1Ar5 reaches #8
+	SUB	D1Ar3, D1Ar3, #1	! decrement count of remaining bytes
+	SETB	[A0.2++], D0Re0
+	CMP	D1Ar5, #8
+	BNE	$Lalign_dst
+
+! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte
+! blocks, then jump to the unaligned copy loop or fall through to the aligned
+! copy loop as appropriate.
+$Laligned_dst:
+	MOV	D0Ar4, A1.2
+	LSR	D1Ar5, D1Ar3, #3	! D1Ar5 = number of 8 byte blocks
+	ANDS	D0Ar4, D0Ar4, #7	! test source alignment
+	BNZ	$Lunaligned_copy	! if unaligned, use unaligned copy loop
+
+! Both source and destination are 8 byte aligned - the easy case.
+$Laligned_copy:
+	LSRS	D1Ar5, D1Ar3, #5	! D1Ar5 = number of 32 byte blocks
+	BZ	$Lbyte_copy
+	SUB	TXRPT, D1Ar5, #1
+
+$Laligned_32:
+	GETL	D0Re0, D1Re0, [A1.2++]
+	GETL	D0Ar6, D1Ar5, [A1.2++]
+	SETL	[A0.2++], D0Re0, D1Re0
+	SETL	[A0.2++], D0Ar6, D1Ar5
+	GETL	D0Re0, D1Re0, [A1.2++]
+	GETL	D0Ar6, D1Ar5, [A1.2++]
+	SETL	[A0.2++], D0Re0, D1Re0
+	SETL	[A0.2++], D0Ar6, D1Ar5
+	BR	$Laligned_32
+
+! If there are any remaining bytes, use the byte copy loop; otherwise we are done
+	ANDS	D1Ar3, D1Ar3, #0x1f
+	BNZ	$Lbyte_copy
+	B	$Lend
+
+! The destination is 8 byte aligned but the source is not, and there are 8
+! or more bytes to be copied.
+$Lunaligned_copy:
+! Adjust the source pointer (A1.2) to the 8 byte boundary before its
+! current value
+	MOV	D0Ar4, A1.2
+	MOV	D0Ar6, A1.2
+	ANDMB	D0Ar4, D0Ar4, #0xfff8
+	MOV	A1.2, D0Ar4
+! Save the number of bytes of mis-alignment in D0Ar4 for use later
+	SUBS	D0Ar6, D0Ar6, D0Ar4
+	MOV	D0Ar4, D0Ar6
+! if there is no mis-alignment after all, use the aligned copy loop
+	BZ	$Laligned_copy
+
+! prefetch 8 bytes
+	GETL	D0Re0, D1Re0, [A1.2]
+
+	SUB	TXRPT, D1Ar5, #1
+
+! There are 3 mis-alignment cases to be considered: less than 4 bytes, exactly
+! 4 bytes, and more than 4 bytes.
+	CMP	D0Ar6, #4
+	BLT	$Lunaligned_1_2_3	! use 1-3 byte mis-alignment loop
+	BZ	$Lunaligned_4		! use 4 byte mis-alignment loop
+
+! The mis-alignment is more than 4 bytes
+$Lunaligned_5_6_7:
+	SUB	D0Ar6, D0Ar6, #4
+! Calculate the bit offsets required for the shift operations necessary
+! to align the data.
+! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
+	MULW	D0Ar6, D0Ar6, #8
+	MOV	D1Ar5, #32
+	SUB	D1Ar5, D1Ar5, D0Ar6
+! Move data 4 bytes before we enter the main loop
+	MOV	D0Re0, D1Re0
+
+$Lloop_5_6_7:
+	GETL	D0Ar2, D1Ar1, [++A1.2]
+! form 64-bit data in D0Re0, D1Re0
+	LSR	D0Re0, D0Re0, D0Ar6
+	MOV	D1Re0, D0Ar2
+	LSL	D1Re0, D1Re0, D1Ar5
+	ADD	D0Re0, D0Re0, D1Re0
+
+	LSR	D0Ar2, D0Ar2, D0Ar6
+	LSL	D1Re0, D1Ar1, D1Ar5
+	ADD	D1Re0, D1Re0, D0Ar2
+
+	SETL	[A0.2++], D0Re0, D1Re0
+	MOV	D0Re0, D1Ar1
+	BR	$Lloop_5_6_7
+
+	B	$Lunaligned_end
+
+$Lunaligned_1_2_3:
+! Calculate the bit offsets required for the shift operations necessary
+! to align the data.
+! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
+	MULW	D0Ar6, D0Ar6, #8
+	MOV	D1Ar5, #32
+	SUB	D1Ar5, D1Ar5, D0Ar6
+
+$Lloop_1_2_3:
+! form 64-bit data in D0Re0, D1Re0
+	LSR	D0Re0, D0Re0, D0Ar6
+	LSL	D1Ar1, D1Re0, D1Ar5
+	ADD	D0Re0, D0Re0, D1Ar1
+	MOV	D0Ar2, D1Re0
+	LSR	D0FrT, D0Ar2, D0Ar6
+	GETL	D0Ar2, D1Ar1, [++A1.2]
+
+	MOV	D1Re0, D0Ar2
+	LSL	D1Re0, D1Re0, D1Ar5
+	ADD	D1Re0, D1Re0, D0FrT
+
+	SETL	[A0.2++], D0Re0, D1Re0
+	MOV	D0Re0, D0Ar2
+	MOV	D1Re0, D1Ar1
+	BR	$Lloop_1_2_3
+
+	B	$Lunaligned_end
+
+! The 4 byte mis-alignment case - this does not require any shifting, just a
+! shuffling of registers.
+$Lunaligned_4:
+	MOV	D0Re0, D1Re0
+$Lloop_4:
+	GETL	D0Ar2, D1Ar1, [++A1.2]
+	MOV	D1Re0, D0Ar2
+	SETL	[A0.2++], D0Re0, D1Re0
+	MOV	D0Re0, D1Ar1
+	BR	$Lloop_4
+
+$Lunaligned_end:
+! If there are no remaining bytes to copy, we are done.
+	ANDS	D1Ar3, D1Ar3, #7
+	BZ	$Lend
+! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte
+! address of the remaining bytes, and fall through to the byte copy loop.
+	MOV	D0Ar6, A1.2
+	ADD	D1Ar5, D0Ar4, D0Ar6
+	MOV	A1.2, D1Ar5
+	B	$Lbyte_copy
+
+	.size	_memcpy,.-_memcpy
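
For reference, below is a rough C sketch (not part of the patch) of the copy
strategy the assembly implements: byte copies for short or trailing data,
aligned block copies when both pointers end up 8-byte aligned, and
shift-and-merge copies when only the destination can be aligned. The function
name memcpy_sketch is illustrative, the merge is simplified to 32-bit words
rather than the 64-bit GETL/SETL pairs used above, and a little-endian target
(as on Meta) is assumed.

#include <stddef.h>
#include <stdint.h>

void *memcpy_sketch(void *dst, const void *src, size_t cnt)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	/* Fewer than 16 bytes: just use the byte copy loop ($Lbyte_copy). */
	if (cnt < 16)
		goto tail;

	/* Copy bytes until the destination is 8-byte aligned ($Lalign_dst). */
	while ((uintptr_t)d & 7) {
		*d++ = *s++;
		cnt--;
	}

	if (((uintptr_t)s & 7) == 0) {
		/* Source aligned too: copy 32-byte blocks ($Laligned_32). */
		const uint64_t *s8 = (const uint64_t *)s;
		uint64_t *d8 = (uint64_t *)d;

		while (cnt >= 32) {
			d8[0] = s8[0];		/* stands in for the GETL/SETL pairs */
			d8[1] = s8[1];
			d8[2] = s8[2];
			d8[3] = s8[3];
			d8 += 4;
			s8 += 4;
			cnt -= 32;
		}
		d = (unsigned char *)d8;
		s = (const unsigned char *)s8;
	} else {
		/*
		 * Destination aligned, source not: read aligned words around the
		 * source and merge them with shifts, as in $Lloop_1_2_3 and
		 * $Lloop_5_6_7 (the 4-byte case needs no shifting at all).
		 */
		unsigned skew = (uintptr_t)s & 3;	/* byte mis-alignment within a word */
		unsigned sh = 8 * skew;			/* D0Ar6: bit offset */
		const uint32_t *ws = (const uint32_t *)((uintptr_t)s & ~(uintptr_t)3);
		uint32_t lo = *ws++;			/* prefetched word */

		while (cnt >= 4) {
			uint32_t hi = *ws++;
			uint32_t out = skew ? (lo >> sh) | (hi << (32 - sh)) : lo;

			*(uint32_t *)d = out;		/* destination is word aligned here */
			lo = hi;
			d += 4;
			s += 4;
			cnt -= 4;
		}
	}

tail:
	/* Copy any remaining bytes one at a time ($Lbyte_copy). */
	while (cnt--)
		*d++ = *s++;

	return dst;
}

Like the assembly's GETL loops, the merge loop always reads whole aligned
words, so it can read a few bytes beyond the last source byte it needs, but
never beyond the aligned block that contains it.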