/* csum-copy.S — x86-64 checksum while copying, with exception handling */
/*
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file COPYING in the main directory of this archive
 * for more details. No warranty for anything given at all.
 */
#include <linux/linkage.h>
#include <asm/errno.h>

/*
 * Checksum copy with exception handling.
 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
 * destination is zeroed.
 *
 * Input
 *	rdi	source
 *	rsi	destination
 *	edx	len (32bit)
 *	ecx	sum (32bit)
 *	r8	src_err_ptr (int)
 *	r9	dst_err_ptr (int)
 *
 * Output
 *	eax	64bit sum. Undefined in case of exception.
 *
 * Wrappers need to take care of valid exception sum and zeroing.
 * They also should align source or destination to 8 bytes.
 */
/*
 * Mark the immediately following instruction as a faultable read from
 * the user source buffer: emit an __ex_table entry so that a fault on
 * it resumes execution at .Lbad_source.
 */
	.macro source
10:
	.section __ex_table,"a"
	.align 8
	.quad 10b,.Lbad_source
	.previous
	.endm
/*
 * Mark the immediately following instruction as a faultable write to
 * the destination buffer: emit an __ex_table entry so that a fault on
 * it resumes execution at .Lbad_dest.
 */
	.macro dest
20:
	.section __ex_table,"a"
	.align 8
	.quad 20b,.Lbad_dest
	.previous
	.endm
/*
 * Mark the immediately following instruction as one whose fault should
 * be skipped rather than reported: the __ex_table entry resumes at \L
 * (default .Lignore).  Used below as "ignore 2f" around a prefetch,
 * whose fault past the end of the buffer is harmless.
 */
	.macro ignore L=.Lignore
30:
	.section __ex_table,"a"
	.align 8
	.quad 30b,\L
	.previous
	.endm
/*
 * csum_partial_copy_generic(src=%rdi, dst=%rsi, len=%edx, sum=%ecx,
 *                           src_err_ptr=%r8, dst_err_ptr=%r9)
 *
 * Copies len bytes from src to dst while accumulating a 64-bit
 * one's-complement partial sum in %rax (folded to 32 bits before the
 * sub-8-byte tail).  On a faulting access the matching error pointer,
 * saved on the stack, is set to -EFAULT; the C wrappers handle zeroing
 * the destination and producing a valid final sum.
 */
	.globl csum_partial_copy_generic
	.p2align 4
csum_partial_copy_generic:
	/* NOTE(review): this compare/branch is dead — both paths land on
	 * .Lignore immediately below.  Presumably leftover from a removed
	 * prefetch preamble that sat between the jle and the label;
	 * harmless.  TODO confirm against upstream history. */
	cmpl	$3*64,%edx
	jle	.Lignore

.Lignore:
	/* Frame: slots 0-1 hold the two error pointers (read by the
	 * exception handlers at the bottom), slots 2-6 save the
	 * callee-saved registers this routine clobbers. */
	subq	$7*8,%rsp
	movq	%rbx,2*8(%rsp)
	movq	%r12,3*8(%rsp)
	movq	%r14,4*8(%rsp)
	movq	%r13,5*8(%rsp)
	movq	%rbp,6*8(%rsp)
	movq	%r8,(%rsp)		/* src_err_ptr -> .Lbad_source */
	movq	%r9,1*8(%rsp)		/* dst_err_ptr -> .Lbad_dest */
	movl	%ecx,%eax		/* running sum lives in rax */
	movl	%edx,%ecx		/* rcx = total length */
	xorl	%r9d,%r9d		/* r9 = constant 0, for adc-folding the final carry */
	movq	%rcx,%r12
	shrq	$6,%r12			/* r12 = number of full 64-byte blocks */
	jz	.Lhandle_tail		/* < 64 bytes */
	clc				/* CF must be clear entering the adc chain */
	/* main loop: checksum and copy in 64-byte blocks */
	/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
	/* r11: temp3, rdx: temp4, r12 loopcnt */
	/* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
	.p2align 4
.Lloop:
	/* Load the whole block first so a source fault happens before any
	 * destination bytes are written. */
	source
	movq	(%rdi),%rbx
	source
	movq	8(%rdi),%r8
	source
	movq	16(%rdi),%r11
	source
	movq	24(%rdi),%rdx
	source
	movq	32(%rdi),%r10
	source
	movq	40(%rdi),%rbp
	source
	movq	48(%rdi),%r14
	source
	movq	56(%rdi),%r13
	ignore	2f			/* prefetch may fault past the end: skip to 2 */
	prefetcht0 5*64(%rdi)
2:
	/* adc chain: CF threads through all eight adds, and across loop
	 * iterations (everything between here and the next adcq preserves
	 * CF: mov/dec/lea/jnz). */
	adcq	%rbx,%rax
	adcq	%r8,%rax
	adcq	%r11,%rax
	adcq	%rdx,%rax
	adcq	%r10,%rax
	adcq	%rbp,%rax
	adcq	%r14,%rax
	adcq	%r13,%rax
	decl	%r12d			/* dec, not sub: preserves CF for the adc chain */
	dest
	movq	%rbx,(%rsi)
	dest
	movq	%r8,8(%rsi)
	dest
	movq	%r11,16(%rsi)
	dest
	movq	%rdx,24(%rsi)
	dest
	movq	%r10,32(%rsi)
	dest
	movq	%rbp,40(%rsi)
	dest
	movq	%r14,48(%rsi)
	dest
	movq	%r13,56(%rsi)
3:	/* NOTE(review): label 3 appears unreferenced — TODO confirm */
	leaq	64(%rdi),%rdi		/* lea advances pointers without touching flags */
	leaq	64(%rsi),%rsi
	jnz	.Lloop			/* ZF still set by decl %r12d above */
	adcq	%r9,%rax		/* fold in the carry left by the last adcq */
	/* do last up to 56 bytes */
.Lhandle_tail:
	/* ecx: count */
	movl	%ecx,%r10d		/* r10d = original length, for the sub-8 tail */
	andl	$63,%ecx
	shrl	$3,%ecx			/* ecx = remaining whole qwords */
	jz	.Lfold
	clc
	.p2align 4
.Lloop_8:
	source
	movq	(%rdi),%rbx
	adcq	%rbx,%rax
	decl	%ecx			/* preserves CF */
	dest
	movq	%rbx,(%rsi)
	leaq	8(%rsi),%rsi		/* preserve carry */
	leaq	8(%rdi),%rdi
	jnz	.Lloop_8
	adcq	%r9,%rax		/* add in carry */
.Lfold:
	/* reduce checksum to 32 bits: low + high halves, plus the carry */
	movl	%eax,%ebx
	shrq	$32,%rax
	addl	%ebx,%eax
	adcl	%r9d,%eax
	/* do last up to 6 bytes */
.Lhandle_7:
	movl	%r10d,%ecx
	andl	$7,%ecx
	shrl	$1,%ecx			/* ecx = remaining 16-bit words */
	jz	.Lhandle_1
	movl	$2,%edx			/* NOTE(review): looks unused — TODO confirm */
	xorl	%ebx,%ebx
	clc
	.p2align 4
.Lloop_1:
	source
	movw	(%rdi),%bx		/* ebx zeroed above, so ebx = zero-extended word */
	adcl	%ebx,%eax
	decl	%ecx
	dest
	movw	%bx,(%rsi)
	leaq	2(%rdi),%rdi
	leaq	2(%rsi),%rsi
	jnz	.Lloop_1
	adcl	%r9d,%eax		/* add in carry */
	/* handle last odd byte */
.Lhandle_1:
	testl	$1,%r10d
	jz	.Lende
	xorl	%ebx,%ebx
	source
	movb	(%rdi),%bl
	dest
	movb	%bl,(%rsi)
	addl	%ebx,%eax
	adcl	%r9d,%eax		/* carry */
.Lende:
	/* restore callee-saved registers and drop the frame */
	movq	2*8(%rsp),%rbx
	movq	3*8(%rsp),%r12
	movq	4*8(%rsp),%r14
	movq	5*8(%rsp),%r13
	movq	6*8(%rsp),%rbp
	addq	$7*8,%rsp
	ret

	/* Exception handlers. Very simple, zeroing is done in the wrappers */
.Lbad_source:
	movq	(%rsp),%rax		/* src_err_ptr; may be NULL */
	testq	%rax,%rax
	jz	.Lende
	movl	$-EFAULT,(%rax)
	jmp	.Lende

.Lbad_dest:
	movq	8(%rsp),%rax		/* dst_err_ptr; may be NULL */
	testq	%rax,%rax
	jz	.Lende
	movl	$-EFAULT,(%rax)
	jmp	.Lende