xor.h

#ifdef CONFIG_KMEMCHECK
/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
# include <asm-generic/xor.h>
#elif !defined(_ASM_X86_XOR_H)
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

#include <asm/i387.h>

#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif
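
/*
 * "i" forces the 256-byte stride to be encoded as an instruction
 * immediate, so no register is tied up on register-starved 32-bit;
 * "re" lets gcc pick either a register or a sign-extended 32-bit
 * immediate on 64-bit.
 */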

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
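
/*
 * Each macro expands to one line of inline-asm text.  OFFS(x) is the
 * byte offset of the x-th 16-byte word within the current 256-byte
 * stripe; PF_OFFS(x) is the same offset one stripe (256 bytes) ahead.
 * LD/ST movaps 16 bytes between source 1 and an %xmm register, XOn
 * xorps 16 bytes of source n+1 into an %xmm register, and PFn issues
 * a non-temporal prefetch on source n+1 so the loop streams through
 * the buffers without polluting the cache.
 */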

/* p1 ^= p2; 'bytes' must be a non-zero multiple of 256. */
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;	/* 256 bytes per iteration */

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
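
/*
 * Reference only, not part of the original header: the plain-C
 * equivalent of xor_sse_2() above, minus the prefetching and
 * blocking.  The helper name is ours; 'bytes' is assumed to be a
 * non-zero multiple of 256, as the loop above requires.
 */
static inline void
xor_2_reference(unsigned long bytes, unsigned long *p1,
		const unsigned long *p2)
{
	unsigned long i;

	for (i = 0; i < bytes / sizeof(unsigned long); i++)
		p1[i] ^= p2[i];
}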

/* p1 ^= p2 ^ p3 */
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

/* p1 ^= p2 ^ p3 ^ p4 */
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

/* p1 ^= p2 ^ p3 ^ p4 ^ p5 */
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	PF4(i)			\
	PF4(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	XO4(i, 0)		\
	XO4(i + 1, 1)		\
	XO4(i + 2, 2)		\
	XO4(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	" .align 32			;\n"
	" 1:				;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	add %[inc], %[p5]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
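
/*
 * Illustrative sketch, not in the original file: asm/xor_32.h and
 * asm/xor_64.h, included below, collect the functions above into a
 * struct xor_block_template (declared in <linux/raid/xor.h>) roughly
 * as follows, so the RAID code can benchmark and pick an
 * implementation at boot.  The variable name here is ours.
 */
#if 0
static struct xor_block_template xor_block_sse_example = {
	.name = "generic_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};
#endif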

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

#endif /* _ASM_X86_XOR_H */