copy_32.S
/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
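/*
 * Copy one 16-byte chunk: four loads from 4..16(r4), four stores to
 * 4..16(r6).  The update forms (lwzu/stwu) on the last pair advance
 * r4 and r6 by 16 for the next iteration.
 */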
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)
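/*
 * As COPY_16_BYTES, but each instruction carries a numeric label
 * (8<n>0 .. 8<n>7) so COPY_16_BYTES_EXCODE(n) can register it in the
 * exception table for user-access fault recovery.
 */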
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)
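/*
 * Fixup code for chunk n of the cacheline loop: label 9<n>0 handles a
 * faulting load, 9<n>1 a faulting store.  Both subtract from r5 the
 * 16*n bytes that earlier chunks of this cacheline already copied,
 * then branch to the common read (104) or write (105) fault handler.
 * The __ex_table entries pair each instruction of
 * COPY_16_BYTES_WITHEX(n) with the matching fixup.
 */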
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	.section __ex_table,"a";		\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text
	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)
/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 */
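/*
 * Presumed C prototype (illustration): void cacheable_memzero(void *p, u32 n),
 * with p in r3 and n in r4.  The code moves n into r5 and zeroes r4 (the
 * fill value), then stores r4 a word at a time; r6 tracks the destination,
 * pre-biased by -4 for the stwu update forms.
 */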
_GLOBAL(cacheable_memzero)
	mr	r5,r4
	li	r4,0
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
#if !defined(CONFIG_8xx)
10:	dcbz	r7,r6
#else
10:	stw	r4, 4(r6)
	stw	r4, 8(r6)
	stw	r4, 12(r6)
	stw	r4, 16(r6)
#if CACHE_LINE_SIZE >= 32
	stw	r4, 20(r6)
	stw	r4, 24(r6)
	stw	r4, 28(r6)
	stw	r4, 32(r6)
#endif /* CACHE_LINE_SIZE */
#endif
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4
2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr
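/*
 * memset(p, c, n): p in r3, c in r4, n in r5.  The two rlwimi
 * instructions splat the low byte of c into all four bytes of r4.
 * A rough C sketch of the splat (illustration only, assumes the
 * high bytes of c are clear):
 *
 *	u32 v = c;
 *	v |= v << 8;	// rlwimi r4,r4,8,16,23
 *	v |= v << 16;	// rlwimi r4,r4,16,0,15
 */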
_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr
/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 */
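/*
 * With dst in r3, src in r4 and n in r5, the regions overlap iff
 * src < dst + n && dst < src + n.  The two cmplw/crand below compute
 * exactly that condition; on overlap we fall back to the plain
 * forward memcpy.
 */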
_GLOBAL(cacheable_memcpy)
	add	r7,r3,r5	/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4		/* cr0.lt &= cr1.lt */
	blt	memcpy		/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0		/* is this more than total to do? */
	blt	63f		/* if not much to do */
	andi.	r8,r0,3		/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)	/* do some bytes */
	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)	/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
#if !defined(CONFIG_8xx)
	dcbz	r11,r6
#endif
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b
64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	blr
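/*
 * memmove: when dst > src an overlapping forward copy would clobber
 * still-unread source bytes, so copy backwards; otherwise fall
 * through to the forward memcpy.
 */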
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f		/* if less than 8 bytes to do */
	andi.	r0,r6,3		/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b
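/*
 * Same structure as memcpy, but r4 and r6 start at the end of the
 * buffers and the update forms walk downwards, so overlapping
 * dst > src moves are handled correctly.
 */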
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b
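/*
 * __copy_tofrom_user(to, from, size): to in r3, from in r4, size in r5.
 * One of the two pointers is a user-space address, so any load or store
 * may fault; each such instruction has an __ex_table entry pointing at
 * fixup code below.  Returns in r3 the number of bytes NOT copied
 * (0 on complete success).
 */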
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

#ifdef CONFIG_8xx
	/* Don't use prefetch on 8xx */
	mtctr	r0
	li	r0,0
53:	COPY_16_BYTES_WITHEX(0)
	bdnz	53b

#else /* not CONFIG_8xx */
	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b
#endif /* CONFIG_8xx */

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b
64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr
/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
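/*
 * For example, a fault in the final word loop arrives here with r3 = 2,
 * so the words left in ctr are counted as ctr << 2 bytes, plus the
 * sub-word remainder already in r5.
 */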
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f		/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text