/*
 *    Optimized memory copy routines.
 *
 *    Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *    Copyright (C) 2013 Helge Deller <deller@gmx.de>
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2, or (at your option)
 *    any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *    Portions derived from the GNU C Library
 *    Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are tried to try to get the best performance for various
 * conditions. In the optimal case, we copy 64-bytes in an unrolled loop using
 * fp regs. This is followed by loops that copy 32- or 16-bytes at a time using
 * general registers.  Unaligned copies are handled either by aligning the
 * destination and then using shift-and-write method, or in a few cases by
 * falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly optimal. Unfortunately some of the
 * semantics of the copy routine (exception handling) is difficult to express
 * in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version
 * of memcpy (written in C) is actually quite fast already. This routine is
 * able to beat it by 30-40% for aligned copies because of the loop unrolling,
 * but in some cases the glibc version is still slightly faster. This lends
 * more credibility that gcc can generate very good code as long as we are
 * careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create additional
 *   interlocks
 * - replace byte-copy loops with stbys sequences
 */
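
/*
 * Overview of the exception-handling trick used throughout this file: every
 * load/store macro below emits an ASM_EXCEPTIONTABLE_ENTRY that maps the
 * access instruction to a local asm label (cda_ldw_exc, cda_stw_exc,
 * pmc_load_exc, pmc_store_exc).  If the access faults, the kernel fixup path
 * records the faulting address and resumes at that label, which sits in front
 * of a C error handler.  The preserve_branch() calls keep gcc from discarding
 * those otherwise-unreachable handlers.
 */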
#ifdef __KERNEL__
#include <linux/module.h>
#include <linux/compiler.h>
#include <asm/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif

DECLARE_PER_CPU(struct exception_data, exception_data);

#define preserve_branch(label)	do {	\
	volatile int dummy = 0;		\
	/* The following branch is never taken, it's just here to */ \
	/* prevent gcc from optimizing away our exception code. */   \
	if (unlikely(dummy != dummy))	\
		goto label;		\
} while (0)

#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space() (0)

#define MERGE(w0, sh_1, w1, sh_2) ({			\
	unsigned int _r;				\
	asm volatile (					\
	"mtsar %3\n"					\
	"shrpw %1, %2, %%sar, %0\n"			\
	: "=r"(_r)					\
	: "r"(w0), "r"(w1), "r"(sh_2)			\
	);						\
	_r;						\
})
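
/* MERGE() is the "shift-and-write" merge mentioned in the header comment.
 * With sh_2 loaded into %sar, shrpw returns the low word of the concatenated
 * pair (w0:w1) shifted right by sh_2 bits, i.e. roughly
 * (w0 << sh_1) | (w1 >> sh_2) for 0 < sh_2 < 32; the sh_1 argument is only
 * kept for symmetry with the glibc macro this was derived from.
 */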
#define THRESHOLD	16

#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
#else
#define DPRINTF(fmt, args...)
#endif

#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)		\
	__asm__ __volatile__ (					\
	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t"		\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)				\
	: _tt(_t), "+r"(_a)					\
	: 							\
	: "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)		\
	__asm__ __volatile__ (					\
	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t"		\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)				\
	: "+r"(_a)						\
	: _tt(_t)						\
	: "r8")

#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)

#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)			\
	__asm__ __volatile__ (					\
	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t"		\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)				\
	: _tt(_t)						\
	: "r"(_a)						\
	: "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)		\
	__asm__ __volatile__ (					\
	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t"		\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)				\
	:							\
	: _tt(_t), "r"(_a)					\
	: "r8")

#define ldw(_s,_o,_a,_t,_e)	def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)	def_store_insn(stw,"r",_s,_t,_o,_a,_e)
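
/* The macros above wrap single PA-RISC loads/stores in asm so that each
 * access names its space register explicitly and carries an exception-table
 * entry: if the access faults, execution continues at the label passed as
 * the final argument (_e).  The ",ma" forms post-increment the address, so
 * the pointer variable passed as _a is updated in place as a side effect.
 */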
#ifdef CONFIG_PREFETCH
static inline void prefetch_src(const void *addr)
{
	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

static inline void prefetch_dst(const void *addr)
{
	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr) do { } while(0)
#define prefetch_dst(addr) do { } while(0)
#endif
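
/* The prefetch helpers load into %r0: the data is discarded, but the line is
 * still brought into the cache, so the load effectively acts as a prefetch
 * hint on PA-RISC.
 */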
#define PA_MEMCPY_OK		0
#define PA_MEMCPY_LOAD_ERROR	1
#define PA_MEMCPY_STORE_ERROR	2

/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
 * per loop.  This code is derived from glibc.
 */
static inline unsigned long copy_dstaligned(unsigned long dst,
					unsigned long src, unsigned long len)
{
	/* gcc complains that a2 and a3 may be uninitialized, but actually
	 * they cannot be.  Initialize a2/a3 to shut gcc up.
	 */
	register unsigned int a0, a1, a2 = 0, a3 = 0;
	int sh_1, sh_2;

	/* prefetch_src((const void *)src); */

	/* Calculate how to shift a word read at the memory operation
	   aligned srcp to make it aligned for copy.  */
	sh_1 = 8 * (src % sizeof(unsigned int));
	sh_2 = 8 * sizeof(unsigned int) - sh_1;

	/* Make src aligned by rounding it down.  */
	src &= -sizeof(unsigned int);

	switch (len % 4)
	{
		case 2:
			/* a1 = ((unsigned int *) src)[0];
			   a2 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a1, cda_ldw_exc);
			ldw(s_space, 4, src, a2, cda_ldw_exc);
			src -= 1 * sizeof(unsigned int);
			dst -= 3 * sizeof(unsigned int);
			len += 2;
			goto do1;
		case 3:
			/* a0 = ((unsigned int *) src)[0];
			   a1 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a0, cda_ldw_exc);
			ldw(s_space, 4, src, a1, cda_ldw_exc);
			src -= 0 * sizeof(unsigned int);
			dst -= 2 * sizeof(unsigned int);
			len += 1;
			goto do2;
		case 0:
			if (len == 0)
				return PA_MEMCPY_OK;
			/* a3 = ((unsigned int *) src)[0];
			   a0 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a3, cda_ldw_exc);
			ldw(s_space, 4, src, a0, cda_ldw_exc);
			src -= -1 * sizeof(unsigned int);
			dst -= 1 * sizeof(unsigned int);
			len += 0;
			goto do3;
		case 1:
			/* a2 = ((unsigned int *) src)[0];
			   a3 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a2, cda_ldw_exc);
			ldw(s_space, 4, src, a3, cda_ldw_exc);
			src -= -2 * sizeof(unsigned int);
			dst -= 0 * sizeof(unsigned int);
			len -= 1;
			if (len == 0)
				goto do0;
			goto do4;			/* No-op.  */
	}
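
	/* The switch above biases src, dst and len and jumps into the middle
	 * of the unrolled loop, so that whatever (len % 4) was, the loop below
	 * always runs a whole number of 4-word iterations.  a0..a3 rotate
	 * through the loop; each stw merges the previously loaded word with
	 * the one just read.
	 */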
	do
	{
		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
		/* a0 = ((unsigned int *) src)[0]; */
		ldw(s_space, 0, src, a0, cda_ldw_exc);
		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
		/* a1 = ((unsigned int *) src)[1]; */
		ldw(s_space, 4, src, a1, cda_ldw_exc);
		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
		/* a2 = ((unsigned int *) src)[2]; */
		ldw(s_space, 8, src, a2, cda_ldw_exc);
		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
		/* a3 = ((unsigned int *) src)[3]; */
		ldw(s_space, 12, src, a3, cda_ldw_exc);
		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

		src += 4 * sizeof(unsigned int);
		dst += 4 * sizeof(unsigned int);
		len -= 4;
	}
	while (len != 0);

do0:
	/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	return PA_MEMCPY_OK;

handle_load_error:
	__asm__ __volatile__ ("cda_ldw_exc:\n");
	return PA_MEMCPY_LOAD_ERROR;

handle_store_error:
	__asm__ __volatile__ ("cda_stw_exc:\n");
	return PA_MEMCPY_STORE_ERROR;
}
/* Returns PA_MEMCPY_OK, PA_MEMCPY_LOAD_ERROR or PA_MEMCPY_STORE_ERROR.
 * In case of an access fault the faulty address can be read from the per_cpu
 * exception data struct. */
static unsigned long pa_memcpy_internal(void *dstp, const void *srcp,
					unsigned long len)
{
	register unsigned long src, dst, t1, t2, t3;
	register unsigned char *pcs, *pcd;
	register unsigned int *pws, *pwd;
	register double *pds, *pdd;
	unsigned long ret;

	src = (unsigned long)srcp;
	dst = (unsigned long)dstp;
	pcs = (unsigned char *)srcp;
	pcd = (unsigned char *)dstp;

	/* prefetch_src((const void *)srcp); */

	if (len < THRESHOLD)
		goto byte_copy;

	/* Check alignment */
	t1 = (src ^ dst);
	if (unlikely(t1 & (sizeof(double)-1)))
		goto unaligned_copy;

	/* src and dst have same alignment. */

	/* Copy bytes till we are double-aligned. */
	t2 = src & (sizeof(double) - 1);
	if (unlikely(t2 != 0)) {
		t2 = sizeof(double) - t2;
		while (t2 && len) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			len--;
			stbma(d_space, t3, pcd, pmc_store_exc);
			t2--;
		}
	}

	pds = (double *)pcs;
	pdd = (double *)pcd;

#if 0
	/* Copy 8 doubles at a time */
	while (len >= 8*sizeof(double)) {
		register double r1, r2, r3, r4, r5, r6, r7, r8;
		/* prefetch_src((char *)pds + L1_CACHE_BYTES); */
		flddma(s_space, pds, r1, pmc_load_exc);
		flddma(s_space, pds, r2, pmc_load_exc);
		flddma(s_space, pds, r3, pmc_load_exc);
		flddma(s_space, pds, r4, pmc_load_exc);
		fstdma(d_space, r1, pdd, pmc_store_exc);
		fstdma(d_space, r2, pdd, pmc_store_exc);
		fstdma(d_space, r3, pdd, pmc_store_exc);
		fstdma(d_space, r4, pdd, pmc_store_exc);

#if 0
		if (L1_CACHE_BYTES <= 32)
			prefetch_src((char *)pds + L1_CACHE_BYTES);
#endif
		flddma(s_space, pds, r5, pmc_load_exc);
		flddma(s_space, pds, r6, pmc_load_exc);
		flddma(s_space, pds, r7, pmc_load_exc);
		flddma(s_space, pds, r8, pmc_load_exc);
		fstdma(d_space, r5, pdd, pmc_store_exc);
		fstdma(d_space, r6, pdd, pmc_store_exc);
		fstdma(d_space, r7, pdd, pmc_store_exc);
		fstdma(d_space, r8, pdd, pmc_store_exc);
		len -= 8*sizeof(double);
	}
#endif
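
	/* The 64-byte fp-register loop described in the header comment is the
	 * "#if 0" block above.  With it disabled, double-aligned data is
	 * copied by the 32-byte and 16-byte word loops that follow.
	 */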
	pws = (unsigned int *)pds;
	pwd = (unsigned int *)pdd;

word_copy:
	while (len >= 8*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
		/* prefetch_src((char *)pws + L1_CACHE_BYTES); */
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);

		ldwma(s_space, pws, r5, pmc_load_exc);
		ldwma(s_space, pws, r6, pmc_load_exc);
		ldwma(s_space, pws, r7, pmc_load_exc);
		ldwma(s_space, pws, r8, pmc_load_exc);
		stwma(d_space, r5, pwd, pmc_store_exc);
		stwma(d_space, r6, pwd, pmc_store_exc);
		stwma(d_space, r7, pwd, pmc_store_exc);
		stwma(d_space, r8, pwd, pmc_store_exc);
		len -= 8*sizeof(unsigned int);
	}

	while (len >= 4*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4;
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);
		len -= 4*sizeof(unsigned int);
	}

	pcs = (unsigned char *)pws;
	pcd = (unsigned char *)pwd;

byte_copy:
	while (len) {
		/* *pcd++ = *pcs++; */
		ldbma(s_space, pcs, t3, pmc_load_exc);
		stbma(d_space, t3, pcd, pmc_store_exc);
		len--;
	}

	return PA_MEMCPY_OK;

unaligned_copy:
	/* possibly we are aligned on a word, but not on a double... */
	if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
		t2 = src & (sizeof(unsigned int) - 1);

		if (unlikely(t2 != 0)) {
			t2 = sizeof(unsigned int) - t2;
			while (t2) {
				/* *pcd++ = *pcs++; */
				ldbma(s_space, pcs, t3, pmc_load_exc);
				stbma(d_space, t3, pcd, pmc_store_exc);
				len--;
				t2--;
			}
		}

		pws = (unsigned int *)pcs;
		pwd = (unsigned int *)pcd;
		goto word_copy;
	}

	/* Align the destination.  */
	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
		while (t2) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			stbma(d_space, t3, pcd, pmc_store_exc);
			len--;
			t2--;
		}
		dst = (unsigned long)pcd;
		src = (unsigned long)pcs;
	}

	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int));
	if (ret)
		return ret;

	pcs += (len & -sizeof(unsigned int));
	pcd += (len & -sizeof(unsigned int));
	len %= sizeof(unsigned int);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	goto byte_copy;

handle_load_error:
	__asm__ __volatile__ ("pmc_load_exc:\n");
	return PA_MEMCPY_LOAD_ERROR;

handle_store_error:
	__asm__ __volatile__ ("pmc_store_exc:\n");
	return PA_MEMCPY_STORE_ERROR;
}
/* Returns 0 for success, otherwise, returns number of bytes not transferred. */
static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
{
	unsigned long ret, fault_addr, reference;
	struct exception_data *d;

	ret = pa_memcpy_internal(dstp, srcp, len);
	if (likely(ret == PA_MEMCPY_OK))
		return 0;

	/* if a load or store fault occurred we can get the faulty addr */
	d = &__get_cpu_var(exception_data);
	fault_addr = d->fault_addr;

	/* error in load or store? */
	if (ret == PA_MEMCPY_LOAD_ERROR)
		reference = (unsigned long) srcp;
	else
		reference = (unsigned long) dstp;

	DPRINTF("pa_memcpy: fault type = %lu, len=%lu fault_addr=%lu ref=%lu\n",
		ret, len, fault_addr, reference);

	if (fault_addr >= reference)
		return len - (fault_addr - reference);
	else
		return len;
}
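
/* Example of the failure accounting above (illustrative numbers only): if a
 * 200-byte copy faults on a load 100 bytes past srcp, then
 * fault_addr - reference == 100 and pa_memcpy() reports 100 bytes as not
 * transferred.
 */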
#ifdef __KERNEL__
unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, src, len);
}

EXPORT_SYMBOL(__copy_from_user);
unsigned long __copy_from_user(void *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_kernel_space(), 2);
	return pa_memcpy(dst, (void __force *)src, len);
}

unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, (void __force *)src, len);
}

void *memcpy(void *dst, const void *src, size_t count)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_kernel_space(), 2);
	pa_memcpy(dst, src, count);
	return dst;
}

EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);
#endif