copy_user_nocache_64.S

/* Copyright 2002 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License v2.
 *
 * Functions to copy from and to user space.
 */

#include <linux/linkage.h>
#include <asm/dwarf2.h>

#define FIX_ALIGNMENT 1

#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeature.h>

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 * The destination is written with non-temporal stores (movnti), so large
 * copies do not pollute the CPU caches.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 * rcx zero flag: when set, zero the rest of the destination on a fault
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
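/*
 * Illustrative usage sketch (not part of this file): in C code the routine
 * is reached through the uaccess helpers.  The prototype and call below are
 * assumptions for illustration and may differ by kernel version:
 *
 *	long __copy_user_nocache(void *dst, const void __user *src,
 *				 unsigned size, int zerorest);
 *
 *	left = __copy_user_nocache(dst, src, size, 1);
 *	// left == 0 on success, otherwise the number of bytes not copied;
 *	// with zerorest != 0 the uncopied tail of dst has been zeroed.
 */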
ENTRY(__copy_user_nocache)
        CFI_STARTPROC
        pushq %rbx
        CFI_ADJUST_CFA_OFFSET 8
        CFI_REL_OFFSET rbx, 0
        pushq %rcx                      /* save zero flag */
        CFI_ADJUST_CFA_OFFSET 8
        CFI_REL_OFFSET rcx, 0

        xorl %eax,%eax                  /* zero for the exception handler */

#ifdef FIX_ALIGNMENT
        /* check for bad alignment of destination */
        movl %edi,%ecx
        andl $7,%ecx
        jnz  .Lbad_alignment
.Lafter_bad_alignment:
#endif

        movq %rdx,%rcx

        movl $64,%ebx
        shrq $6,%rdx
        decq %rdx
        js   .Lhandle_tail
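        /*
         * At this point:
         *   rcx = number of bytes left to copy (any alignment prefix has
         *         already been handled above)
         *   rbx = 64, the block size, kept for the exception fixup code
         *   rdx = rcx/64 - 1, the unrolled-loop counter; negative means
         *         there is no full 64-byte block, so fall to the tail
         */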

        .p2align 4
.Lloop:
.Ls1:   movq (%rsi),%r11
.Ls2:   movq 1*8(%rsi),%r8
.Ls3:   movq 2*8(%rsi),%r9
.Ls4:   movq 3*8(%rsi),%r10
.Ld1:   movnti %r11,(%rdi)
.Ld2:   movnti %r8,1*8(%rdi)
.Ld3:   movnti %r9,2*8(%rdi)
.Ld4:   movnti %r10,3*8(%rdi)

.Ls5:   movq 4*8(%rsi),%r11
.Ls6:   movq 5*8(%rsi),%r8
.Ls7:   movq 6*8(%rsi),%r9
.Ls8:   movq 7*8(%rsi),%r10
.Ld5:   movnti %r11,4*8(%rdi)
.Ld6:   movnti %r8,5*8(%rdi)
.Ld7:   movnti %r9,6*8(%rdi)
.Ld8:   movnti %r10,7*8(%rdi)

        dec  %rdx

        leaq 64(%rsi),%rsi
        leaq 64(%rdi),%rdi

        jns  .Lloop
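        /*
         * Tail: the remaining bytes (less than 64) are copied in 8-byte
         * chunks (.Lloop_8) and then byte by byte (.Lloop_1).
         */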
        .p2align 4
.Lhandle_tail:
        movl %ecx,%edx
        andl $63,%ecx
        shrl $3,%ecx
        jz   .Lhandle_7
        movl $8,%ebx
        .p2align 4
.Lloop_8:
.Ls9:   movq (%rsi),%r8
.Ld9:   movnti %r8,(%rdi)
        decl %ecx
        leaq 8(%rdi),%rdi
        leaq 8(%rsi),%rsi
        jnz  .Lloop_8

.Lhandle_7:
        movl %edx,%ecx
        andl $7,%ecx
        jz   .Lende
        .p2align 4
.Lloop_1:
.Ls10:  movb (%rsi),%bl
.Ld10:  movb %bl,(%rdi)
        incq %rdi
        incq %rsi
        decl %ecx
        jnz  .Lloop_1

        CFI_REMEMBER_STATE
.Lende:
        popq %rcx
        CFI_ADJUST_CFA_OFFSET -8
        CFI_RESTORE %rcx
        popq %rbx
        CFI_ADJUST_CFA_OFFSET -8
        CFI_RESTORE rbx
        sfence
        ret
        CFI_RESTORE_STATE

#ifdef FIX_ALIGNMENT
        /* align destination */
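        /*
         * Here ecx = rdi & 7 from the check at the top.  Copy 8 - (rdi & 7)
         * single bytes so the destination becomes 8-byte aligned, subtract
         * them from the count and rejoin the main path; if the whole count
         * is not larger than that prefix, let the byte tail handle it.
         */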
        .p2align 4
.Lbad_alignment:
        movl $8,%r9d
        subl %ecx,%r9d
        movl %r9d,%ecx
        cmpq %r9,%rdx
        jz   .Lhandle_7
        js   .Lhandle_7
.Lalign_1:
.Ls11:  movb (%rsi),%bl
.Ld11:  movb %bl,(%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz  .Lalign_1
        subq %r9,%rdx
        jmp  .Lafter_bad_alignment
#endif

        /* table sorted by exception address */
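        /*
         * Each entry is a pair of 8-byte addresses: the location of an
         * instruction that may fault, followed by the fixup code to resume
         * at.  On a fault the exception handler searches this table and
         * continues execution at the matching fixup address.
         */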
        .section __ex_table,"a"
        .align 8
        .quad .Ls1,.Ls1e
        .quad .Ls2,.Ls2e
        .quad .Ls3,.Ls3e
        .quad .Ls4,.Ls4e
        .quad .Ld1,.Ls1e
        .quad .Ld2,.Ls2e
        .quad .Ld3,.Ls3e
        .quad .Ld4,.Ls4e
        .quad .Ls5,.Ls5e
        .quad .Ls6,.Ls6e
        .quad .Ls7,.Ls7e
        .quad .Ls8,.Ls8e
        .quad .Ld5,.Ls5e
        .quad .Ld6,.Ls6e
        .quad .Ld7,.Ls7e
        .quad .Ld8,.Ls8e
        .quad .Ls9,.Le_quad
        .quad .Ld9,.Le_quad
        .quad .Ls10,.Le_byte
        .quad .Ld10,.Le_byte
#ifdef FIX_ALIGNMENT
        .quad .Ls11,.Lzero_rest
        .quad .Ld11,.Lzero_rest
#endif
        .quad .Le5,.Le_zero
        .previous

        /* compute 64-offset for main loop. 8 bytes accuracy with error on the
           pessimistic side. this is gross. it would be better to fix the
           interface. */
        /* eax: zero, ebx: 64 */
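        /*
         * Example: a fault on the third store (.Ld3) is fixed up at .Ls3e
         * and falls through .Ls4e-.Ls8e, so eax = 6*8 = 48 bytes of the
         * current 64-byte block count as uncopied.  rdi still points at the
         * start of that block, so rdi + 64 - 48 is the first byte that was
         * not written.  The remaining full blocks (rdx*64), those 48 bytes
         * and the count&63 tail are then summed into rdx as the total to
         * report (and to zero, if the zero flag was set).
         */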
.Ls1e:  addl $8,%eax
.Ls2e:  addl $8,%eax
.Ls3e:  addl $8,%eax
.Ls4e:  addl $8,%eax
.Ls5e:  addl $8,%eax
.Ls6e:  addl $8,%eax
.Ls7e:  addl $8,%eax
.Ls8e:  addl $8,%eax
        addq %rbx,%rdi                  /* +64 */
        subq %rax,%rdi                  /* correct destination with computed offset */

        shlq $6,%rdx                    /* loop counter * 64 (stride length) */
        addq %rax,%rdx                  /* add offset to loopcnt */
        andl $63,%ecx                   /* remaining bytes */
        addq %rcx,%rdx                  /* add them */
        jmp  .Lzero_rest

        /* exception on quad word loop in tail handling */
        /* ecx: loopcnt/8, %edx: length, rdi: correct */
.Le_quad:
        shll $3,%ecx
        andl $7,%edx
        addl %ecx,%edx
        /* edx: bytes to zero, rdi: dest, eax:zero */
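        /*
         * (%rsp) still holds the zero flag that was passed in rcx and pushed
         * in the prologue: if it is clear, skip the zeroing and just report
         * rdx uncopied bytes; otherwise rcx is loaded with that byte count
         * and rep stosb (eax cleared below) zeroes the rest of the
         * destination before returning.
         */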
.Lzero_rest:
        cmpl $0,(%rsp)                  /* zero flag set? */
        jz   .Le_zero
        movq %rdx,%rcx
.Le_byte:
        xorl %eax,%eax
.Le5:   rep
        stosb
        /* when there is another exception while zeroing the rest just return */
.Le_zero:
        movq %rdx,%rax
        jmp  .Lende
        CFI_ENDPROC
ENDPROC(__copy_user_nocache)