memset.S

/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/config.h>
#include <linux/linkage.h>
#include <asm/dwarf2.h>

/*
 * ISO C memset - set a memory block to a byte value.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 */
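
/*
 * memset_c - rep stos based variant.  It is not called directly; the
 * alternatives entry at the end of this file patches the start of
 * memset() into a jump here on CPUs with X86_FEATURE_REP_GOOD.
 */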
	ALIGN
memset_c:
	CFI_STARTPROC
	movq %rdi,%r9		/* save original destination for the return value */
	movl %edx,%r8d
	andl $7,%r8d		/* r8d = trailing byte count (count % 8) */
	movl %edx,%ecx
	shrl $3,%ecx		/* ecx = quadword count (count / 8) */
	/* expand byte value */
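	/*
	 * Multiplying the zero-extended byte by 0x0101010101010101
	 * replicates it into every byte of rax (the widening multiply
	 * clobbers rdx with the high half, which is zero here).
	 */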
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	mulq %rsi		/* with rax, clobbers rdx */
	rep stosq		/* store rcx quadwords of rax */
	movl %r8d,%ecx
	rep stosb		/* store the remaining tail bytes */
	movq %r9,%rax
	ret
	CFI_ENDPROC
ENDPROC(memset_c)
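
/*
 * Generic, open coded variant: replicate the byte value, align the
 * destination to 8 bytes, fill 64 byte blocks in an unrolled loop,
 * then finish the remainder with 8 byte and 1 byte loops.
 */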
ENTRY(memset)
ENTRY(__memset)
	CFI_STARTPROC
	movq %rdi,%r10		/* save original destination for the return value */
	movq %rdx,%r11		/* r11 = byte count */

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	mul %rcx		/* with rax, clobbers rdx */
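	/* rax now holds the fill byte replicated into all eight bytes */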

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d		/* r9d = misalignment of dst modulo 8 */
	jnz .Lbad_alignment
	CFI_REMEMBER_STATE
.Lafter_bad_alignment:

	movl %r11d,%ecx
	shrl $6,%ecx		/* ecx = number of whole 64 byte blocks */
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	decl %ecx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl %r11d,%ecx
	andl $63&(~7),%ecx	/* bytes left in whole quadwords after the 64 byte blocks */
	jz .Lhandle_7
	shrl $3,%ecx
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	movl %r11d,%ecx
	andl $7,%ecx		/* remaining single bytes */
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %ecx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax		/* return the original destination */
	ret
	CFI_RESTORE_STATE
.Lbad_alignment:
	cmpq $7,%r11
	jbe .Lhandle_7		/* fewer than 8 bytes total: use the byte loop */
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8		/* r8 = bytes up to the next 8 byte boundary */
	addq %r8,%rdi
	subq %r8,%r11		/* skip them and shrink the remaining count */
	jmp .Lafter_bad_alignment

.Lfinal:
	CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)

	/* Some CPUs run faster using the string instructions.
	   It is also a lot simpler.  Use this when possible. */
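	/*
	 * The .altinstructions entry below tells the alternatives patching
	 * code to overwrite the first bytes of memset() with the short jmp
	 * from .altinstr_replacement (a jump to memset_c) when the CPU
	 * advertises X86_FEATURE_REP_GOOD.
	 */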

#include <asm/cpufeature.h>

	.section .altinstr_replacement,"ax"
1:	.byte 0xeb				/* jmp <disp8> */
	.byte (memset_c - memset) - (2f - 1b)	/* offset */
2:
	.previous
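
	/*
	 * One alternatives record (struct alt_instr): original address,
	 * replacement address, required CPU feature bit, length of the
	 * original, length of the replacement.
	 */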
	.section .altinstructions,"a"
	.align 8
	.quad memset
	.quad 1b
	.byte X86_FEATURE_REP_GOOD
	.byte .Lfinal - memset
	.byte 2b - 1b
	.previous