#ifdef CONFIG_KMEMCHECK
/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
# include <asm-generic/xor.h>
#elif !defined(_ASM_X86_XOR_H)
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

#include <asm/i387.h>

#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif
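
/*
 * <asm/i387.h> provides kernel_fpu_begin()/kernel_fpu_end(), which
 * bracket every routine below so the xmm registers can be clobbered
 * safely.  On 32-bit, "i" forces the 256-byte stride to be encoded as
 * an immediate so the loop does not tie up one of the eight GPRs; on
 * x86-64, "re" also allows a register, "e" restricting any immediate
 * to the 32-bit sign-extended range that a 64-bit add can encode.
 */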

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"

#define PF0(x)		" prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x, y)	" movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y)	" movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x)		" prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x)		" prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x)		" prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x)		" prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"

#define XO1(x, y)	" xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y)	" xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y)	" xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y)	" xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"

#define NOP(x)
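
/*
 * Each helper emits one assembly line; LD(2, 2), for example, expands
 * to " movaps 16*(2)(%[p1]), %%xmm2 ;\n" and XO1(2, 2) to
 * " xorps 16*(2)(%[p2]), %%xmm2 ;\n": a 16-byte load or xor at byte
 * offset 32 of the current 256-byte chunk.
 */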

#define BLK64(pf, op, i)	\
	pf(i)			\
	op(i, 0)		\
	op(i + 1, 1)		\
	op(i + 2, 2)		\
	op(i + 3, 3)
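
/*
 * BLK64(PF1, XO1, 0), for example, expands to
 *
 *	PF1(0) XO1(0, 0) XO1(1, 1) XO1(2, 2) XO1(3, 3)
 *
 * i.e. one prefetch plus four 16-byte ops covering a full 64-byte
 * cache line of one source stream.  Each xor_sse_N() below computes
 * lines = bytes >> 8 because one pass through its loop consumes 256
 * bytes per stream: four BLOCK()s of four xmm registers of 16 bytes
 * each.  Callers must therefore pass a byte count that is a multiple
 * of 256; the result is written back through p1.  The PF0(0)/PF0(2)
 * pair that the non-pf64 routines place ahead of the loop label
 * primes the prefetch stream for the first iteration.
 */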

static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(NOP, ST, i)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
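
/*
 * The _pf64 variants issue one prefetchnta per 64-byte chunk, rather
 * than the two per 32 bytes that the plain routines interleave with
 * the arithmetic; as the "prefetch64-sse" template name suggests,
 * this maps one prefetch onto each 64-byte cache line.  Both schemes
 * fetch 256 bytes ahead of the current position (see PF_OFFS above).
 */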

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(NOP, ST, i)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(PF3, XO3, i)	\
	BLK64(NOP, ST, i)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	PF4(i)			\
	PF4(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	XO4(i, 0)		\
	XO4(i + 1, 1)		\
	XO4(i + 2, 2)		\
	XO4(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)

	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" add %[inc], %[p5] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	BLK64(PF0, LD, i)	\
	BLK64(PF1, XO1, i)	\
	BLK64(PF2, XO2, i)	\
	BLK64(PF3, XO3, i)	\
	BLK64(PF4, XO4, i)	\
	BLK64(NOP, ST, i)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" add %[inc], %[p5] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};
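
/*
 * A minimal sketch of how a selected template is driven (buffer names
 * hypothetical; the byte count must be a multiple of 256 as noted
 * above):
 *
 *	struct xor_block_template *t = &xor_block_sse_pf64;
 *
 *	t->do_2(4096, dest, src);		// dest ^= src
 *	t->do_3(4096, dest, src1, src2);	// dest ^= src1 ^ src2
 */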

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)
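
/*
 * AVX_SELECT() is brought in through the xor_32.h/xor_64.h chain
 * (ultimately asm/xor_avx.h): on AVX-capable CPUs it substitutes the
 * AVX template for the benchmarked winner FASTEST, and otherwise
 * returns FASTEST unchanged.
 */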

#endif /* _ASM_X86_XOR_H */