/* memset_64.S — x86-64 memset() implementations (generic + alternatives) */
  1. /* Copyright 2002 Andi Kleen, SuSE Labs */
  2. #include <linux/linkage.h>
  3. #include <asm/dwarf2.h>
  4. #include <asm/cpufeature.h>
  5. #include <asm/alternative-asm.h>
  6. /*
  7. * ISO C memset - set a memory block to a byte value. This function uses fast
  8. * string to get better performance than the original function. The code is
  9. * simpler and shorter than the orignal function as well.
  10. *
  11. * rdi destination
  12. * rsi value (char)
  13. * rdx count (bytes)
  14. *
  15. * rax original destination
  16. */
  17. .section .altinstr_replacement, "ax", @progbits
  18. .Lmemset_c:
  19. movq %rdi,%r9
  20. movl %edx,%r8d
  21. andl $7,%r8d
  22. movl %edx,%ecx
  23. shrl $3,%ecx
  24. /* expand byte value */
  25. movzbl %sil,%esi
  26. movabs $0x0101010101010101,%rax
  27. mulq %rsi /* with rax, clobbers rdx */
  28. rep stosq
  29. movl %r8d,%ecx
  30. rep stosb
  31. movq %r9,%rax
  32. ret
  33. .Lmemset_e:
  34. .previous
  35. /*
  36. * ISO C memset - set a memory block to a byte value. This function uses
  37. * enhanced rep stosb to override the fast string function.
  38. * The code is simpler and shorter than the fast string function as well.
  39. *
  40. * rdi destination
  41. * rsi value (char)
  42. * rdx count (bytes)
  43. *
  44. * rax original destination
  45. */
  46. .section .altinstr_replacement, "ax", @progbits
  47. .Lmemset_c_e:
  48. movq %rdi,%r9
  49. movb %sil,%al
  50. movl %edx,%ecx
  51. rep stosb
  52. movq %r9,%rax
  53. ret
  54. .Lmemset_e_e:
  55. .previous
  56. ENTRY(memset)
  57. ENTRY(__memset)
  58. CFI_STARTPROC
  59. movq %rdi,%r10
  60. movq %rdx,%r11
  61. /* expand byte value */
  62. movzbl %sil,%ecx
  63. movabs $0x0101010101010101,%rax
  64. mul %rcx /* with rax, clobbers rdx */
  65. /* align dst */
  66. movl %edi,%r9d
  67. andl $7,%r9d
  68. jnz .Lbad_alignment
  69. CFI_REMEMBER_STATE
  70. .Lafter_bad_alignment:
  71. movl %r11d,%ecx
  72. shrl $6,%ecx
  73. jz .Lhandle_tail
  74. .p2align 4
  75. .Lloop_64:
  76. decl %ecx
  77. movq %rax,(%rdi)
  78. movq %rax,8(%rdi)
  79. movq %rax,16(%rdi)
  80. movq %rax,24(%rdi)
  81. movq %rax,32(%rdi)
  82. movq %rax,40(%rdi)
  83. movq %rax,48(%rdi)
  84. movq %rax,56(%rdi)
  85. leaq 64(%rdi),%rdi
  86. jnz .Lloop_64
  87. /* Handle tail in loops. The loops should be faster than hard
  88. to predict jump tables. */
  89. .p2align 4
  90. .Lhandle_tail:
  91. movl %r11d,%ecx
  92. andl $63&(~7),%ecx
  93. jz .Lhandle_7
  94. shrl $3,%ecx
  95. .p2align 4
  96. .Lloop_8:
  97. decl %ecx
  98. movq %rax,(%rdi)
  99. leaq 8(%rdi),%rdi
  100. jnz .Lloop_8
  101. .Lhandle_7:
  102. movl %r11d,%ecx
  103. andl $7,%ecx
  104. jz .Lende
  105. .p2align 4
  106. .Lloop_1:
  107. decl %ecx
  108. movb %al,(%rdi)
  109. leaq 1(%rdi),%rdi
  110. jnz .Lloop_1
  111. .Lende:
  112. movq %r10,%rax
  113. ret
  114. CFI_RESTORE_STATE
  115. .Lbad_alignment:
  116. cmpq $7,%r11
  117. jbe .Lhandle_7
  118. movq %rax,(%rdi) /* unaligned store */
  119. movq $8,%r8
  120. subq %r9,%r8
  121. addq %r8,%rdi
  122. subq %r8,%r11
  123. jmp .Lafter_bad_alignment
  124. .Lfinal:
  125. CFI_ENDPROC
  126. ENDPROC(memset)
  127. ENDPROC(__memset)
  128. /* Some CPUs support enhanced REP MOVSB/STOSB feature.
  129. * It is recommended to use this when possible.
  130. *
  131. * If enhanced REP MOVSB/STOSB feature is not available, use fast string
  132. * instructions.
  133. *
  134. * Otherwise, use original memset function.
  135. *
  136. * In .altinstructions section, ERMS feature is placed after REG_GOOD
  137. * feature to implement the right patch order.
  138. */
  139. .section .altinstructions,"a"
  140. altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
  141. .Lfinal-memset,.Lmemset_e-.Lmemset_c
  142. altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
  143. .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
  144. .previous