/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/dwarf2.h>

/*
 * ISO C memset - set a memory block to a byte value.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
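
/*
 * Equivalent C prototype (x86-64 System V calling convention):
 *
 *	void *memset(void *dest, int c, size_t n);
 *
 * dest arrives in rdi, c in rsi, n in rdx, and the original dest is
 * returned in rax.  Two implementations follow: memset_c, built on the
 * rep stos string instructions, and an open-coded loop version.  Which
 * one actually runs is selected at boot through the alternatives entry
 * at the end of this file.
 */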
	ALIGN
memset_c:
	CFI_STARTPROC
	movq %rdi,%r9		/* save original destination for return */
	movl %edx,%r8d
	andl $7,%r8d		/* r8d = trailing bytes (count & 7) */
	movl %edx,%ecx
	shrl $3,%ecx		/* ecx = whole qwords (count >> 3) */
	/* expand byte value */
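	/*
	 * Multiplying the zero-extended byte by 0x0101010101010101
	 * replicates it into every byte of rax, e.g.
	 * 0x5a * 0x0101010101010101 = 0x5a5a5a5a5a5a5a5a.
	 */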
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	mulq %rsi		/* with rax, clobbers rdx */
	rep stosq		/* store ecx qwords of rax */
	movl %r8d,%ecx
	rep stosb		/* store the remaining 0-7 bytes */
	movq %r9,%rax		/* return the original destination */
	ret
	CFI_ENDPROC
ENDPROC(memset_c)

ENTRY(memset)
ENTRY(__memset)
	CFI_STARTPROC
	movq %rdi,%r10		/* save original destination */
	movq %rdx,%r11		/* save count; rdx is clobbered below */

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	mul %rcx		/* with rax, clobbers rdx */
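	/*
	 * mul writes its 128-bit product to rdx:rax, which is why the
	 * byte count was copied to r11 above before rdx was clobbered.
	 */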

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d		/* r9d = misalignment of dst (dst & 7) */
	jnz .Lbad_alignment
	CFI_REMEMBER_STATE
.Lafter_bad_alignment:

	movl %r11d,%ecx
	shrl $6,%ecx		/* ecx = number of 64-byte blocks */
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	decl %ecx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/* Handle tail in loops.  The loops should be faster than
	   hard-to-predict jump tables. */
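	/*
	 * At most 63 bytes remain after the 64-byte loop: .Lloop_8
	 * stores the remaining whole qwords, then .Lloop_1 stores the
	 * final 0-7 bytes one at a time.
	 */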
	.p2align 4
.Lhandle_tail:
	movl %r11d,%ecx
	andl $63&(~7),%ecx	/* ecx = bytes in remaining whole qwords (count & 56) */
	jz .Lhandle_7
	shrl $3,%ecx
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	movl %r11d,%ecx
	andl $7,%ecx		/* ecx = trailing bytes (count & 7) */
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %ecx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax		/* return the original destination */
	ret
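
	/*
	 * Slow path for a destination that is not 8-byte aligned.
	 * Counts of 7 bytes or less just use the byte loop.  Otherwise
	 * do one unaligned 8-byte store at the start, then advance dst
	 * by (8 - misalignment) bytes and shrink the count by the same
	 * amount so the main path continues on an aligned destination.
	 */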
	CFI_RESTORE_STATE
.Lbad_alignment:
	cmpq $7,%r11
	jbe .Lhandle_7
	movq %rax,(%rdi)	/* unaligned store of 8 bytes */
	movq $8,%r8
	subq %r9,%r8		/* r8 = 8 - misalignment */
	addq %r8,%rdi
	subq %r8,%r11
	jmp .Lafter_bad_alignment

.Lfinal:
	CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)

	/* Some CPUs run faster using the string instructions.
	   It is also a lot simpler.  Use this when possible. */
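	/*
	 * The entry below tells the kernel's alternatives patching code
	 * (roughly: apply_alternatives(), run during boot) to overwrite
	 * the start of memset with a short jump to memset_c when the
	 * CPU advertises X86_FEATURE_REP_GOOD, i.e. when the rep stos
	 * string operations are known to be fast.
	 */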

#include <asm/cpufeature.h>

	.section .altinstr_replacement,"ax"
1:	.byte 0xeb				/* jmp <disp8> */
	.byte (memset_c - memset) - (2f - 1b)	/* offset */
2:
	.previous
	.section .altinstructions,"a"
	.align 8
	.quad memset			/* original code to patch */
	.quad 1b			/* replacement */
	.byte X86_FEATURE_REP_GOOD	/* required CPU feature */
	.byte .Lfinal - memset		/* length of original */
	.byte 2b - 1b			/* length of replacement */
	.previous