copy_user_nocache_64.S

/* Copyright 2002 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License v2.
 *
 * Functions to copy from and to user space.
 */

#include <linux/linkage.h>
#include <asm/dwarf2.h>

#define FIX_ALIGNMENT 1

#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeature.h>
/*
 * copy_user_nocache - Uncached memory copy with exception handling.
 * The copy uses non-temporal stores to keep the destination buffer
 * out of the cache, for better performance on large copies.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 * rcx zero flag: when non-zero, zero the rest of the destination on an exception
 *
 * Output:
 * eax uncopied bytes, or 0 if successful.
 */
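/*
 * Note (illustrative, not from the original source): with the register
 * assignments above and the SysV AMD64 calling convention (rdi, rsi,
 * rdx, rcx carrying the first four integer arguments), this entry point
 * roughly corresponds to a C prototype such as
 *
 *      long __copy_user_nocache(void *dst, const void *src,
 *                               unsigned long size, int zerorest);
 *
 * The parameter names and exact types are an assumption; only the
 * register/meaning mapping is documented by this file.
 */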
ENTRY(__copy_user_nocache)
        CFI_STARTPROC
        pushq %rbx
        CFI_ADJUST_CFA_OFFSET 8
        CFI_REL_OFFSET rbx, 0
        pushq %rcx              /* save zero flag */
        CFI_ADJUST_CFA_OFFSET 8
        CFI_REL_OFFSET rcx, 0

        xorl %eax,%eax          /* zero for the exception handler */

#ifdef FIX_ALIGNMENT
        /* check for bad alignment of destination */
        movl %edi,%ecx
        andl $7,%ecx
        jnz .Lbad_alignment
.Lafter_bad_alignment:
#endif

        movq %rdx,%rcx
        movl $64,%ebx
        shrq $6,%rdx
        decq %rdx
        js .Lhandle_tail
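/*
 * Counter setup for the copy loops below: rcx keeps the full byte count
 * for the tail handling, while rdx becomes count/64 - 1 and drives the
 * main loop (which therefore runs count/64 times, once per 64-byte
 * block).  Illustrative example: count = 200 gives three iterations of
 * .Lloop (192 bytes), then 200 & 63 = 8 leftover bytes handled as one
 * quadword in .Lloop_8 and no single bytes in .Lloop_1.
 */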
        .p2align 4
.Lloop:
.Ls1:   movq (%rsi),%r11
.Ls2:   movq 1*8(%rsi),%r8
.Ls3:   movq 2*8(%rsi),%r9
.Ls4:   movq 3*8(%rsi),%r10
.Ld1:   movnti %r11,(%rdi)
.Ld2:   movnti %r8,1*8(%rdi)
.Ld3:   movnti %r9,2*8(%rdi)
.Ld4:   movnti %r10,3*8(%rdi)

.Ls5:   movq 4*8(%rsi),%r11
.Ls6:   movq 5*8(%rsi),%r8
.Ls7:   movq 6*8(%rsi),%r9
.Ls8:   movq 7*8(%rsi),%r10
.Ld5:   movnti %r11,4*8(%rdi)
.Ld6:   movnti %r8,5*8(%rdi)
.Ld7:   movnti %r9,6*8(%rdi)
.Ld8:   movnti %r10,7*8(%rdi)

        dec %rdx
        leaq 64(%rsi),%rsi
        leaq 64(%rdi),%rdi
        jns .Lloop
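/*
 * The stores in .Lloop use movnti, a non-temporal hint that moves the
 * data to memory without filling the cache with the destination buffer.
 * Non-temporal stores are weakly ordered, which is why the exit path
 * below issues sfence before returning: it orders the movnti stores
 * ahead of anything the caller subsequently does with the buffer.
 */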
        .p2align 4
.Lhandle_tail:
        movl %ecx,%edx
        andl $63,%ecx
        shrl $3,%ecx
        jz .Lhandle_7
        movl $8,%ebx
        .p2align 4
.Lloop_8:
.Ls9:   movq (%rsi),%r8
.Ld9:   movnti %r8,(%rdi)
        decl %ecx
        leaq 8(%rdi),%rdi
        leaq 8(%rsi),%rsi
        jnz .Lloop_8

.Lhandle_7:
        movl %edx,%ecx
        andl $7,%ecx
        jz .Lende
        .p2align 4
.Lloop_1:
.Ls10:  movb (%rsi),%bl
.Ld10:  movb %bl,(%rdi)
        incq %rdi
        incq %rsi
        decl %ecx
        jnz .Lloop_1

        CFI_REMEMBER_STATE
.Lende:
        popq %rcx
        CFI_ADJUST_CFA_OFFSET -8
        CFI_RESTORE %rcx
        popq %rbx
        CFI_ADJUST_CFA_OFFSET -8
        CFI_RESTORE rbx
        sfence
        ret
        CFI_RESTORE_STATE
#ifdef FIX_ALIGNMENT
        /* align destination */
        .p2align 4
.Lbad_alignment:
        movl $8,%r9d
        subl %ecx,%r9d
        movl %r9d,%ecx
        cmpq %r9,%rdx
        jz .Lhandle_7
        js .Lhandle_7
.Lalign_1:
.Ls11:  movb (%rsi),%bl
.Ld11:  movb %bl,(%rdi)
        incq %rsi
        incq %rdi
        decl %ecx
        jnz .Lalign_1
        subq %r9,%rdx
        jmp .Lafter_bad_alignment
#endif
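/*
 * The alignment prologue above copies 8 - (rdi & 7) single bytes so that
 * the 8-byte movnti stores in the main loop land on naturally aligned
 * addresses (an efficiency measure for write combining).  Illustrative
 * example: a destination address ending in ...5 gives r9d = 8 - 5 = 3,
 * so three bytes go through .Lalign_1 and rdx shrinks by 3 before the
 * code rejoins the fast path at .Lafter_bad_alignment; counts no larger
 * than the prologue itself are diverted straight to the byte tail.
 */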
        /* table sorted by exception address */
        .section __ex_table,"a"
        .align 8
        .quad .Ls1,.Ls1e        /* .Ls[1-4] - 0 bytes copied */
        .quad .Ls2,.Ls1e
        .quad .Ls3,.Ls1e
        .quad .Ls4,.Ls1e
        .quad .Ld1,.Ls1e        /* .Ld[1-4] - 0..24 bytes copied */
        .quad .Ld2,.Ls2e
        .quad .Ld3,.Ls3e
        .quad .Ld4,.Ls4e
        .quad .Ls5,.Ls5e        /* .Ls[5-8] - 32 bytes copied */
        .quad .Ls6,.Ls5e
        .quad .Ls7,.Ls5e
        .quad .Ls8,.Ls5e
        .quad .Ld5,.Ls5e        /* .Ld[5-8] - 32..56 bytes copied */
        .quad .Ld6,.Ls6e
        .quad .Ld7,.Ls7e
        .quad .Ld8,.Ls8e
        .quad .Ls9,.Le_quad
        .quad .Ld9,.Le_quad
        .quad .Ls10,.Le_byte
        .quad .Ld10,.Le_byte
#ifdef FIX_ALIGNMENT
        .quad .Ls11,.Lzero_rest
        .quad .Ld11,.Lzero_rest
#endif
        .quad .Le5,.Le_zero
        .previous
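/*
 * Each entry above pairs the address of an instruction that may fault on
 * a user access with the address execution should resume at.  In C terms
 * the .quad pairs follow the classic two-word layout (a sketch of what
 * these directives emit; the real definition lives in the uaccess
 * headers):
 *
 *      struct exception_table_entry {
 *              unsigned long insn;     (faulting instruction address)
 *              unsigned long fixup;    (address to continue at)
 *      };
 *
 * On a fault the page-fault handler looks the faulting rip up via
 * search_exception_tables() and, if an entry is found, rewrites rip to
 * the fixup label instead of oopsing.
 */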
        /* eax: zero, ebx: 64 */
.Ls1e:  addl $8,%eax            /* eax: bytes left uncopied: Ls1e: 64 .. Ls8e: 8 */
.Ls2e:  addl $8,%eax
.Ls3e:  addl $8,%eax
.Ls4e:  addl $8,%eax
.Ls5e:  addl $8,%eax
.Ls6e:  addl $8,%eax
.Ls7e:  addl $8,%eax
.Ls8e:  addl $8,%eax
        addq %rbx,%rdi          /* +64 */
        subq %rax,%rdi          /* correct destination with computed offset */

        shlq $6,%rdx            /* loop counter * 64 (stride length) */
        addq %rax,%rdx          /* add offset to loopcnt */
        andl $63,%ecx           /* remaining bytes */
        addq %rcx,%rdx          /* add them */
        jmp .Lzero_rest
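/*
 * .Ls1e..Ls8e form a fall-through chain: the exception table sends each
 * faulting load/store into the chain at the label matching how much of
 * the current 64-byte block had already been stored, and every remaining
 * addl $8 then executes, so eax ends up holding the bytes of that block
 * which were not copied (64 entering at .Ls1e down to 8 at .Ls8e).
 * rdi is then advanced to the first uncopied destination byte, and rdx
 * is rebuilt as the total uncopied count: remaining full blocks * 64,
 * plus the uncopied part of this block, plus the tail bytes (count & 63).
 */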
        /* exception on quad word loop in tail handling */
        /* ecx: loopcnt/8, %edx: length, rdi: correct */
.Le_quad:
        shll $3,%ecx
        andl $7,%edx
        addl %ecx,%edx
        /* edx: bytes to zero, rdi: dest, eax: zero */
.Lzero_rest:
        cmpl $0,(%rsp)          /* zero flag set? */
        jz .Le_zero
        movq %rdx,%rcx
.Le_byte:
        xorl %eax,%eax
.Le5:   rep
        stosb
        /* when there is another exception while zeroing the rest just return */
.Le_zero:
        movq %rdx,%rax
        jmp .Lende
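/*
 * Failure-path summary: by the time control reaches .Lzero_rest, rdx is
 * supposed to hold the number of destination bytes still uncopied.  If
 * the zero flag saved on the stack in the prologue is set, those bytes
 * are cleared with rep stosb so a partially filled buffer is not left
 * with stale data (presumably what copy_from_user-style callers rely
 * on); the uncopied count is then returned in rax, matching the
 * "Output" contract in the header comment.
 */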
        CFI_ENDPROC
ENDPROC(__copy_user_nocache)