/* copy_page.S */
/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
/* Don't use streaming stores because it's better when the target
   ends up in cache. */
/* Could vary the prefetch distance based on SMP/UP */
  /*
   * void copy_page(void *to, void *from)
   *
   * Copy one 4096-byte page using plain 64-bit loads and stores,
   * 64 bytes (eight quadwords) per loop iteration.
   *
   * In:       %rdi = destination, %rsi = source
   * Out:      nothing; %rdi and %rsi have each advanced by 4096
   * Clobbers: %rax, %rcx, %rdx, %r8-%r11, flags
   * Stack:    16 bytes holding the caller's %rbx and %r12, which are
   *           used as copy temporaries and restored before returning.
   */
	.globl	copy_page
	.p2align 4
copy_page:
	subq	$2*8,%rsp
	movq	%rbx,(%rsp)
	movq	%r12,1*8(%rsp)

	/*
	 * Main loop: all 64-byte lines except the last five.  The prefetch
	 * runs five lines ahead of the loads, so stopping five lines early
	 * keeps it from reaching past the end of the source page.
	 */
	movl	$(4096/64)-5,%ecx
	.p2align 4
.Loop64:
	decl	%ecx			/* ZF is consumed by the jnz below */

	movq	(%rsi),%rax
	movq	8(%rsi),%rbx
	movq	16(%rsi),%rdx
	movq	24(%rsi),%r8
	movq	32(%rsi),%r9
	movq	40(%rsi),%r10
	movq	48(%rsi),%r11
	movq	56(%rsi),%r12

	prefetcht0 5*64(%rsi)

	movq	%rax,(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rdx,16(%rdi)
	movq	%r8,24(%rdi)
	movq	%r9,32(%rdi)
	movq	%r10,40(%rdi)
	movq	%r11,48(%rdi)
	movq	%r12,56(%rdi)

	/* leaq rather than addq: the flags set by decl must survive */
	leaq	64(%rsi),%rsi
	leaq	64(%rdi),%rdi

	jnz	.Loop64

	/* Tail loop: the remaining five lines, same copy without prefetch. */
	movl	$5,%ecx
	.p2align 4
.Loop2:
	decl	%ecx			/* ZF is consumed by the jnz below */

	movq	(%rsi),%rax
	movq	8(%rsi),%rbx
	movq	16(%rsi),%rdx
	movq	24(%rsi),%r8
	movq	32(%rsi),%r9
	movq	40(%rsi),%r10
	movq	48(%rsi),%r11
	movq	56(%rsi),%r12

	movq	%rax,(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rdx,16(%rdi)
	movq	%r8,24(%rdi)
	movq	%r9,32(%rdi)
	movq	%r10,40(%rdi)
	movq	%r11,48(%rdi)
	movq	%r12,56(%rdi)

	leaq	64(%rdi),%rdi
	leaq	64(%rsi),%rsi

	jnz	.Loop2

	movq	(%rsp),%rbx
	movq	1*8(%rsp),%r12
	addq	$2*8,%rsp
	ret
  /*
   * C stepping K8 runs faster using the string copy instructions.
   * It is also a lot simpler. Use this when possible.
   */
#include <asm/cpufeature.h>

	/*
	 * One record in .altinstructions pairing copy_page with copy_page_c
	 * under the X86_FEATURE_K8_C feature flag.
	 * NOTE(review): the field order presumably mirrors the kernel's
	 * struct alt_instr (original address, replacement address, feature,
	 * original length, replacement length) -- confirm against the header.
	 */
	.section .altinstructions,"a"
	.align 8
	.quad	copy_page			/* address of the default code */
	.quad	copy_page_c			/* address of the alternative code */
	.byte	X86_FEATURE_K8_C		/* feature that selects the alternative */
	/* Both length bytes are the size of copy_page_c, not of copy_page. */
	.byte	copy_page_c_end-copy_page_c
	.byte	copy_page_c_end-copy_page_c
	.previous
  /*
   * copy_page_c - string-instruction variant of the page copy.
   *
   * Placed in .altinstr_replacement; copy_page_c_end marks its end so
   * that its byte length can be computed as copy_page_c_end-copy_page_c.
   *
   * In:       %rdi = destination, %rsi = source
   * Clobbers: %rcx; %rdi and %rsi are advanced by the string copy
   * NOTE(review): assumes the direction flag is clear on entry so the
   * copy runs upward -- confirm against the kernel's calling rules.
   */
	.section .altinstr_replacement,"ax"
copy_page_c:
	movl	$4096/8,%ecx		/* quadword count for 4096 bytes */
	rep movsq
	ret
copy_page_c_end:
	.previous