/* Copyright 2002 Andi Kleen, SuSE Labs */
/*
 * ISO C memset - set a memory block to a byte value.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 */
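
/*
 * Strategy, as implemented below: align the destination to 8 bytes,
 * store 64 bytes per iteration with eight quadword moves, then finish
 * the remainder with an 8-byte loop and a final byte loop.
 */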
	.globl __memset
	.globl memset
	.p2align 4
memset:
__memset:
	movq %rdi,%r10
	movq %rdx,%r11

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	mul    %rcx		/* with rax, clobbers rdx */
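	/*
	 * The multiply above replicates the zero-extended byte into all
	 * eight byte lanes of %rax: 0x01 in each byte position times the
	 * value puts one copy of the byte in every lane
	 * (e.g. 0xAB * 0x0101010101010101 = 0xABABABABABABABAB).
	 */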

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d
	jnz  .Lbad_alignment
.Lafter_bad_alignment:

	movl %r11d,%ecx
	shrl $6,%ecx
	jz   .Lhandle_tail
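	/* %ecx now holds count/64, the number of iterations of the
	   unrolled 64-byte store loop below. */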
	.p2align 4
.Lloop_64:
	decl %ecx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz  .Lloop_64

	/* Handle tail in loops. The loops should be faster than
	   hard-to-predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl %r11d,%ecx
	andl $63&(~7),%ecx
	jz   .Lhandle_7
	shrl $3,%ecx
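	/* $63&(~7) keeps the tail bytes not covered by the 64-byte loop,
	   rounded down to a multiple of 8; after the shift %ecx is the
	   number of remaining quadword stores (at most 7). */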
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz  .Lloop_8

.Lhandle_7:
	movl %r11d,%ecx
	andl $7,%ecx
	jz   .Lende
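	/* Store the 0..7 trailing bytes one at a time. */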
	.p2align 4
.Lloop_1:
	decl %ecx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz  .Lloop_1

.Lende:
	movq %r10,%rax
	ret
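
/*
 * Destination was not 8-byte aligned: do one unaligned 8-byte store to
 * cover the head, then advance %rdi to the next 8-byte boundary and
 * shrink the count accordingly before rejoining the aligned path.
 * Counts of 7 bytes or less skip this and use the small-tail loops.
 */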
.Lbad_alignment:
	cmpq $7,%r11
	jbe  .Lhandle_7
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%r11
	jmp  .Lafter_bad_alignment

	/* Some CPUs run faster using the string instructions.
	   It is also a lot simpler. Use this when possible. */

#include <asm/cpufeature.h>
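
/*
 * Each record here is: address of the code to patch, address of the
 * replacement, the CPU feature bit that selects the replacement, and
 * two length bytes (region to patch and replacement size; here both
 * are the size of memset_c, which ends in ret, so the rest of the
 * original memset body is never reached once patched). At boot the
 * kernel rewrites memset with memset_c on CPUs that advertise
 * X86_FEATURE_REP_GOOD, i.e. fast rep string instructions.
 */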
	.section .altinstructions,"a"
	.align 8
	.quad memset
	.quad memset_c
	.byte X86_FEATURE_REP_GOOD
	.byte memset_c_end-memset_c
	.byte memset_c_end-memset_c
	.previous

	.section .altinstr_replacement,"ax"
/* rdi	destination
 * rsi	value
 * rdx	count
 */
memset_c:
	movq %rdi,%r9
	movl %edx,%r8d
	andl $7,%r8d
	movl %edx,%ecx
	shrl $3,%ecx
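	/* %ecx = count/8 quadwords for the rep stosq below;
	   %r8d = count%8 leftover bytes for the rep stosb. */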
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	mulq   %rsi		/* with rax, clobbers rdx */
	rep
	stosq
	movl %r8d,%ecx
	rep
	stosb
	movq %r9,%rax
	ret
memset_c_end:
	.previous