crc32-pclmul_asm.S 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. /* GPL HEADER START
  2. *
  3. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License version 2 only,
  7. * as published by the Free Software Foundation.
  8. *
  9. * This program is distributed in the hope that it will be useful, but
  10. * WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * General Public License version 2 for more details (a copy is included
  13. * in the LICENSE file that accompanied this code).
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * version 2 along with this program; If not, see http://www.gnu.org/licenses
  17. *
  18. * Please visit http://www.xyratex.com/contact if you need additional
  19. * information or have any questions.
  20. *
  21. * GPL HEADER END
  22. */
  23. /*
  24. * Copyright 2012 Xyratex Technology Limited
  25. *
  26. * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
  27. * calculation.
  28. * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
  29. * PCLMULQDQ is a carry-less multiplication instruction with its own CPUID feature flag (it is not part of SSE4.2); the reference can be found
  30. * at:
  31. * http://www.intel.com/products/processor/manuals/
  32. * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
  33. * Volume 2B: Instruction Set Reference, N-Z
  34. *
  35. * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
  36. * Alexander Boyko <Alexander_Boyko@xyratex.com>
  37. */
  38. #include <linux/linkage.h>
  39. #include <asm/inst.h>
  .align 16
/*
 * Folding and reduction constants.  Every entry is one 16-byte value written
 * as a little-endian pair of quadwords: low 64 bits first, high 64 bits
 * second.  All entries stay 16-byte aligned because each is exactly 16 bytes
 * long and the first one is aligned above.
 *
 * P(x) is the CRC32 polynomial.  The trailing ' in the formulas below
 * presumably denotes bit reflection, as in Intel's PCLMULQDQ CRC paper --
 * confirm against that reference.
 */

/*
 * R1 = [(x^(4*128+32) mod P(x)) << 32]' << 1 = 0x154442bd4
 * #define CONSTANT_R1 0x154442bd4LL
 *
 * R2 = [(x^(4*128-32) mod P(x)) << 32]' << 1 = 0x1c6e41596
 * #define CONSTANT_R2 0x1c6e41596LL
 */
.Lconstant_R2R1:
	.quad	0x0000000154442bd4		/* low  qword: R1 */
	.quad	0x00000001c6e41596		/* high qword: R2 */

/*
 * R3 = [(x^(128+32) mod P(x)) << 32]' << 1 = 0x1751997d0
 * #define CONSTANT_R3 0x1751997d0LL
 *
 * R4 = [(x^(128-32) mod P(x)) << 32]' << 1 = 0x0ccaa009e
 * #define CONSTANT_R4 0x0ccaa009eLL
 */
.Lconstant_R4R3:
	.quad	0x00000001751997d0		/* low  qword: R3 */
	.quad	0x00000000ccaa009e		/* high qword: R4 */

/*
 * R5 = [(x^64 mod P(x)) << 32]' << 1 = 0x163cd6124
 * #define CONSTANT_R5 0x163cd6124LL
 */
.Lconstant_R5:
	.quad	0x0000000163cd6124		/* low  qword: R5 */
	.quad	0x0000000000000000		/* high qword: zero */

/* Mask that keeps only the lowest 32 bits of an XMM register. */
.Lconstant_mask32:
	.quad	0x00000000FFFFFFFF
	.quad	0x0000000000000000

/*
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett Reduction constant (u64') = u' = (x^64 / P(x))' = 0x1F7011641LL
 * #define CONSTANT_RU 0x1F7011641LL
 */
.Lconstant_RUpoly:
	.quad	0x00000001DB710641		/* low  qword: CRCPOLY_TRUE_LE_FULL */
	.quad	0x00000001F7011641		/* high qword: CONSTANT_RU */
  /* Symbolic name for %xmm0; the folding/reduction constants are loaded here. */
#define CONSTANT %xmm0

/*
 * Registers that carry the three arguments (buffer, len, crc32).
 *
 * 32-bit: %eax / %edx / %ecx -- presumably a register-argument (regparm)
 *         calling convention; confirm against the C declaration in use.
 * 64-bit: %rdi / %rsi / %edx, the first three System V AMD64 integer
 *         argument registers.
 */
#ifndef __x86_64__
#warning Using 32bit code support
#define BUF %eax
#define LEN %edx
#define CRC %ecx
#else
#define BUF %rdi
#define LEN %rsi
#define CRC %edx
#endif
  .text
/**
 * crc32_pclmul_le_16 - calculate crc32 using PCLMULQDQ folding
 *
 * uint crc32_pclmul_le_16(unsigned char const *buffer,
 *                         size_t len, uint crc32)
 *
 * BUF - buffer (16 bytes aligned)
 * LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63
 * CRC - initial crc32
 * return %eax crc32
 *
 * The first 64 bytes are loaded before LEN is examined, so a shorter buffer
 * is read out of bounds; callers must honour the LEN > 63 requirement.
 *
 * LEN is treated as unsigned everywhere (size_t), so the loop conditions use
 * the unsigned jb/jae family.
 *
 * Clobbers: BUF, LEN, flags, %xmm0-%xmm7 (plus %xmm8 on x86_64) and, on
 * 32-bit builds, %ecx (it becomes the PIC base after CRC has been consumed).
 * pextrd at the end requires SSE4.1.
 */
.globl crc32_pclmul_le_16
.align 4, 0x90
crc32_pclmul_le_16:	/* buffer and buffer size are 16 bytes aligned */
	/* Load the first 64 bytes and xor the initial CRC into the low dword. */
	movdqa	(BUF), %xmm1
	movdqa	0x10(BUF), %xmm2
	movdqa	0x20(BUF), %xmm3
	movdqa	0x30(BUF), %xmm4
	movd	CRC, CONSTANT
	pxor	CONSTANT, %xmm1
	sub	$0x40, LEN
	add	$0x40, BUF
#ifndef __x86_64__
	/*
	 * This is for position independent code(-fPIC) support for 32bit.
	 * %ecx held CRC, which was already consumed by the movd above, so it
	 * is free to become the base for "symbol - .Ldelta" addressing.
	 */
	call	.Ldelta
.Ldelta:
	pop	%ecx
#endif
	cmp	$0x40, LEN
	jb	.Lless_64

#ifdef __x86_64__
	movdqa	.Lconstant_R2R1(%rip), CONSTANT
#else
	movdqa	.Lconstant_R2R1 - .Ldelta(%ecx), CONSTANT
#endif

.Lloop_64:	/* 64 bytes Full cache line folding */
	prefetchnta	0x40(BUF)
	movdqa	%xmm1, %xmm5
	movdqa	%xmm2, %xmm6
	movdqa	%xmm3, %xmm7
#ifdef __x86_64__
	movdqa	%xmm4, %xmm8
#endif
	/* low qword of each accumulator times low qword of CONSTANT (R1) */
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x00, CONSTANT, %xmm2
	PCLMULQDQ 0x00, CONSTANT, %xmm3
#ifdef __x86_64__
	PCLMULQDQ 0x00, CONSTANT, %xmm4
#endif
	/* high qword of each saved copy times high qword of CONSTANT (R2) */
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	PCLMULQDQ 0x11, CONSTANT, %xmm6
	PCLMULQDQ 0x11, CONSTANT, %xmm7
#ifdef __x86_64__
	PCLMULQDQ 0x11, CONSTANT, %xmm8
#endif
	pxor	%xmm5, %xmm1
	pxor	%xmm6, %xmm2
	pxor	%xmm7, %xmm3
#ifdef __x86_64__
	pxor	%xmm8, %xmm4
#else
	/* xmm8 unsupported for x32: fold the fourth lane reusing xmm5 */
	movdqa	%xmm4, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm4
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm4
#endif
	/* mix in the next 64 bytes of input */
	pxor	(BUF), %xmm1
	pxor	0x10(BUF), %xmm2
	pxor	0x20(BUF), %xmm3
	pxor	0x30(BUF), %xmm4

	sub	$0x40, LEN
	add	$0x40, BUF
	cmp	$0x40, LEN
	/*
	 * Unsigned compare: LEN is a size_t.  A signed jge here would leave
	 * the loop early for lengths with the top bit set and silently return
	 * the CRC of a truncated buffer.
	 */
	jae	.Lloop_64

.Lless_64:	/* Folding cache line into 128bit */
#ifdef __x86_64__
	movdqa	.Lconstant_R4R3(%rip), CONSTANT
#else
	movdqa	.Lconstant_R4R3 - .Ldelta(%ecx), CONSTANT
#endif
	prefetchnta	(BUF)

	/* xmm1 = fold(xmm1) ^ xmm2 */
	movdqa	%xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	%xmm2, %xmm1

	/* xmm1 = fold(xmm1) ^ xmm3 */
	movdqa	%xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	%xmm3, %xmm1

	/* xmm1 = fold(xmm1) ^ xmm4 */
	movdqa	%xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	%xmm4, %xmm1

	cmp	$0x10, LEN
	jb	.Lfold_64

.Lloop_16:	/* Folding rest buffer into 128bit */
	movdqa	%xmm1, %xmm5
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	PCLMULQDQ 0x11, CONSTANT, %xmm5
	pxor	%xmm5, %xmm1
	pxor	(BUF), %xmm1
	sub	$0x10, LEN
	add	$0x10, BUF
	cmp	$0x10, LEN
	/* unsigned, for the same reason as in the 64-byte loop */
	jae	.Lloop_16

.Lfold_64:
	/* perform the last 64 bit fold, also adds 32 zeroes
	 * to the input stream */
	PCLMULQDQ 0x01, %xmm1, CONSTANT	/* R4 * xmm1.low */
	psrldq	$0x08, %xmm1
	pxor	CONSTANT, %xmm1

	/* final 32-bit fold */
	movdqa	%xmm1, %xmm2
#ifdef __x86_64__
	movdqa	.Lconstant_R5(%rip), CONSTANT
	movdqa	.Lconstant_mask32(%rip), %xmm3
#else
	movdqa	.Lconstant_R5 - .Ldelta(%ecx), CONSTANT
	movdqa	.Lconstant_mask32 - .Ldelta(%ecx), %xmm3
#endif
	psrldq	$0x04, %xmm2
	pand	%xmm3, %xmm1
	PCLMULQDQ 0x00, CONSTANT, %xmm1
	pxor	%xmm2, %xmm1

	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
#ifdef __x86_64__
	movdqa	.Lconstant_RUpoly(%rip), CONSTANT
#else
	movdqa	.Lconstant_RUpoly - .Ldelta(%ecx), CONSTANT
#endif
	movdqa	%xmm1, %xmm2
	pand	%xmm3, %xmm1
	PCLMULQDQ 0x10, CONSTANT, %xmm1	/* xmm1.low * high qword of CONSTANT */
	pand	%xmm3, %xmm1
	PCLMULQDQ 0x00, CONSTANT, %xmm1	/* xmm1.low * low qword of CONSTANT */
	pxor	%xmm2, %xmm1
	pextrd	$0x01, %xmm1, %eax	/* result is dword 1 of xmm1 */
	ret