/* copyuser_64.S */
/*
 * arch/ppc64/lib/copyuser.S
 *
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
  13. .align 7
  14. _GLOBAL(__copy_tofrom_user)
  15. /* first check for a whole page copy on a page boundary */
  16. cmpldi cr1,r5,16
  17. cmpdi cr6,r5,4096
  18. or r0,r3,r4
  19. neg r6,r3 /* LS 3 bits = # bytes to 8-byte dest bdry */
  20. andi. r0,r0,4095
  21. std r3,-24(r1)
  22. crand cr0*4+2,cr0*4+2,cr6*4+2
  23. std r4,-16(r1)
  24. std r5,-8(r1)
  25. dcbt 0,r4
  26. beq .Lcopy_page_4K
  27. andi. r6,r6,7
  28. mtcrf 0x01,r5
  29. blt cr1,.Lshort_copy
  30. bne .Ldst_unaligned
  31. .Ldst_aligned:
  32. andi. r0,r4,7
  33. addi r3,r3,-16
  34. bne .Lsrc_unaligned
  35. srdi r7,r5,4
  36. 20: ld r9,0(r4)
  37. addi r4,r4,-8
  38. mtctr r7
  39. andi. r5,r5,7
  40. bf cr7*4+0,22f
  41. addi r3,r3,8
  42. addi r4,r4,8
  43. mr r8,r9
  44. blt cr1,72f
  45. 21: ld r9,8(r4)
  46. 70: std r8,8(r3)
  47. 22: ldu r8,16(r4)
  48. 71: stdu r9,16(r3)
  49. bdnz 21b
  50. 72: std r8,8(r3)
  51. beq+ 3f
  52. addi r3,r3,16
  53. 23: ld r9,8(r4)
  54. .Ldo_tail:
  55. bf cr7*4+1,1f
  56. rotldi r9,r9,32
  57. 73: stw r9,0(r3)
  58. addi r3,r3,4
  59. 1: bf cr7*4+2,2f
  60. rotldi r9,r9,16
  61. 74: sth r9,0(r3)
  62. addi r3,r3,2
  63. 2: bf cr7*4+3,3f
  64. rotldi r9,r9,8
  65. 75: stb r9,0(r3)
  66. 3: li r3,0
  67. blr
  68. .Lsrc_unaligned:
  69. srdi r6,r5,3
  70. addi r5,r5,-16
  71. subf r4,r0,r4
  72. srdi r7,r5,4
  73. sldi r10,r0,3
  74. cmpldi cr6,r6,3
  75. andi. r5,r5,7
  76. mtctr r7
  77. subfic r11,r10,64
  78. add r5,r5,r0
  79. bt cr7*4+0,28f
  80. 24: ld r9,0(r4) /* 3+2n loads, 2+2n stores */
  81. 25: ld r0,8(r4)
  82. sld r6,r9,r10
  83. 26: ldu r9,16(r4)
  84. srd r7,r0,r11
  85. sld r8,r0,r10
  86. or r7,r7,r6
  87. blt cr6,79f
  88. 27: ld r0,8(r4)
  89. b 2f
  90. 28: ld r0,0(r4) /* 4+2n loads, 3+2n stores */
  91. 29: ldu r9,8(r4)
  92. sld r8,r0,r10
  93. addi r3,r3,-8
  94. blt cr6,5f
  95. 30: ld r0,8(r4)
  96. srd r12,r9,r11
  97. sld r6,r9,r10
  98. 31: ldu r9,16(r4)
  99. or r12,r8,r12
  100. srd r7,r0,r11
  101. sld r8,r0,r10
  102. addi r3,r3,16
  103. beq cr6,78f
  104. 1: or r7,r7,r6
  105. 32: ld r0,8(r4)
  106. 76: std r12,8(r3)
  107. 2: srd r12,r9,r11
  108. sld r6,r9,r10
  109. 33: ldu r9,16(r4)
  110. or r12,r8,r12
  111. 77: stdu r7,16(r3)
  112. srd r7,r0,r11
  113. sld r8,r0,r10
  114. bdnz 1b
  115. 78: std r12,8(r3)
  116. or r7,r7,r6
  117. 79: std r7,16(r3)
  118. 5: srd r12,r9,r11
  119. or r12,r8,r12
  120. 80: std r12,24(r3)
  121. bne 6f
  122. li r3,0
  123. blr
  124. 6: cmpwi cr1,r5,8
  125. addi r3,r3,32
  126. sld r9,r9,r10
  127. ble cr1,.Ldo_tail
  128. 34: ld r0,8(r4)
  129. srd r7,r0,r11
  130. or r9,r7,r9
  131. b .Ldo_tail
  132. .Ldst_unaligned:
  133. mtcrf 0x01,r6 /* put #bytes to 8B bdry into cr7 */
  134. subf r5,r6,r5
  135. li r7,0
  136. cmpldi r1,r5,16
  137. bf cr7*4+3,1f
  138. 35: lbz r0,0(r4)
  139. 81: stb r0,0(r3)
  140. addi r7,r7,1
  141. 1: bf cr7*4+2,2f
  142. 36: lhzx r0,r7,r4
  143. 82: sthx r0,r7,r3
  144. addi r7,r7,2
  145. 2: bf cr7*4+1,3f
  146. 37: lwzx r0,r7,r4
  147. 83: stwx r0,r7,r3
  148. 3: mtcrf 0x01,r5
  149. add r4,r6,r4
  150. add r3,r6,r3
  151. b .Ldst_aligned
  152. .Lshort_copy:
  153. bf cr7*4+0,1f
  154. 38: lwz r0,0(r4)
  155. 39: lwz r9,4(r4)
  156. addi r4,r4,8
  157. 84: stw r0,0(r3)
  158. 85: stw r9,4(r3)
  159. addi r3,r3,8
  160. 1: bf cr7*4+1,2f
  161. 40: lwz r0,0(r4)
  162. addi r4,r4,4
  163. 86: stw r0,0(r3)
  164. addi r3,r3,4
  165. 2: bf cr7*4+2,3f
  166. 41: lhz r0,0(r4)
  167. addi r4,r4,2
  168. 87: sth r0,0(r3)
  169. addi r3,r3,2
  170. 3: bf cr7*4+3,4f
  171. 42: lbz r0,0(r4)
  172. 88: stb r0,0(r3)
  173. 4: li r3,0
  174. blr
  175. /*
  176. * exception handlers follow
  177. * we have to return the number of bytes not copied
  178. * for an exception on a load, we set the rest of the destination to 0
  179. */
  180. 136:
  181. 137:
  182. add r3,r3,r7
  183. b 1f
  184. 130:
  185. 131:
  186. addi r3,r3,8
  187. 120:
  188. 122:
  189. 124:
  190. 125:
  191. 126:
  192. 127:
  193. 128:
  194. 129:
  195. 133:
  196. addi r3,r3,8
  197. 121:
  198. 132:
  199. addi r3,r3,8
  200. 123:
  201. 134:
  202. 135:
  203. 138:
  204. 139:
  205. 140:
  206. 141:
  207. 142:
  208. /*
  209. * here we have had a fault on a load and r3 points to the first
  210. * unmodified byte of the destination
  211. */
  212. 1: ld r6,-24(r1)
  213. ld r4,-16(r1)
  214. ld r5,-8(r1)
  215. subf r6,r6,r3
  216. add r4,r4,r6
  217. subf r5,r6,r5 /* #bytes left to go */
  218. /*
  219. * first see if we can copy any more bytes before hitting another exception
  220. */
  221. mtctr r5
  222. 43: lbz r0,0(r4)
  223. addi r4,r4,1
  224. 89: stb r0,0(r3)
  225. addi r3,r3,1
  226. bdnz 43b
  227. li r3,0 /* huh? all copied successfully this time? */
  228. blr
  229. /*
  230. * here we have trapped again, need to clear ctr bytes starting at r3
  231. */
  232. 143: mfctr r5
  233. li r0,0
  234. mr r4,r3
  235. mr r3,r5 /* return the number of bytes not copied */
  236. 1: andi. r9,r4,7
  237. beq 3f
  238. 90: stb r0,0(r4)
  239. addic. r5,r5,-1
  240. addi r4,r4,1
  241. bne 1b
  242. blr
  243. 3: cmpldi cr1,r5,8
  244. srdi r9,r5,3
  245. andi. r5,r5,7
  246. blt cr1,93f
  247. mtctr r9
  248. 91: std r0,0(r4)
  249. addi r4,r4,8
  250. bdnz 91b
  251. 93: beqlr
  252. mtctr r5
  253. 92: stb r0,0(r4)
  254. addi r4,r4,1
  255. bdnz 92b
  256. blr
  257. /*
  258. * exception handlers for stores: we just need to work
  259. * out how many bytes weren't copied
  260. */
  261. 182:
  262. 183:
  263. add r3,r3,r7
  264. b 1f
  265. 180:
  266. addi r3,r3,8
  267. 171:
  268. 177:
  269. addi r3,r3,8
  270. 170:
  271. 172:
  272. 176:
  273. 178:
  274. addi r3,r3,4
  275. 185:
  276. addi r3,r3,4
  277. 173:
  278. 174:
  279. 175:
  280. 179:
  281. 181:
  282. 184:
  283. 186:
  284. 187:
  285. 188:
  286. 189:
  287. 1:
  288. ld r6,-24(r1)
  289. ld r5,-8(r1)
  290. add r6,r6,r5
  291. subf r3,r3,r6 /* #bytes not copied */
  292. 190:
  293. 191:
  294. 192:
  295. blr /* #bytes not copied in r3 */
  296. .section __ex_table,"a"
  297. .align 3
  298. .llong 20b,120b
  299. .llong 21b,121b
  300. .llong 70b,170b
  301. .llong 22b,122b
  302. .llong 71b,171b
  303. .llong 72b,172b
  304. .llong 23b,123b
  305. .llong 73b,173b
  306. .llong 74b,174b
  307. .llong 75b,175b
  308. .llong 24b,124b
  309. .llong 25b,125b
  310. .llong 26b,126b
  311. .llong 27b,127b
  312. .llong 28b,128b
  313. .llong 29b,129b
  314. .llong 30b,130b
  315. .llong 31b,131b
  316. .llong 32b,132b
  317. .llong 76b,176b
  318. .llong 33b,133b
  319. .llong 77b,177b
  320. .llong 78b,178b
  321. .llong 79b,179b
  322. .llong 80b,180b
  323. .llong 34b,134b
  324. .llong 35b,135b
  325. .llong 81b,181b
  326. .llong 36b,136b
  327. .llong 82b,182b
  328. .llong 37b,137b
  329. .llong 83b,183b
  330. .llong 38b,138b
  331. .llong 39b,139b
  332. .llong 84b,184b
  333. .llong 85b,185b
  334. .llong 40b,140b
  335. .llong 86b,186b
  336. .llong 41b,141b
  337. .llong 87b,187b
  338. .llong 42b,142b
  339. .llong 88b,188b
  340. .llong 43b,143b
  341. .llong 89b,189b
  342. .llong 90b,190b
  343. .llong 91b,191b
  344. .llong 92b,192b
  345. .text
/*
 * Routine to copy a whole page of data, optimized for POWER4.
 * On POWER4 it is more than 50% faster than the simple loop
 * above (following the .Ldst_aligned label) but it runs slightly
 * slower on POWER3.
 *
 * Reached from __copy_tofrom_user when src, dest and count are all
 * page-aligned (count == 4096).  Any fault jumps to the single
 * handler at 100:, which restores state and retries via the
 * standard .Ldst_aligned path.
 */
.Lcopy_page_4K:
	/* save the non-volatile GPRs used by the software pipeline below */
	std	r31,-32(1)
	std	r30,-40(1)
	std	r29,-48(1)
	std	r28,-56(1)
	std	r27,-64(1)
	std	r26,-72(1)
	std	r25,-80(1)
	std	r24,-88(1)
	std	r23,-96(1)
	std	r22,-104(1)
	std	r21,-112(1)
	std	r20,-120(1)
	li	r5,4096/32 - 1		/* loop bookkeeping: 32-byte chunks */
	addi	r3,r3,-8
	li	r0,5
0:	addi	r5,r5,-24
	mtctr	r0
	/* prime the pipeline: load a dword from each of six regions */
20:	ld	r22,640(4)
21:	ld	r21,512(4)
22:	ld	r20,384(4)
23:	ld	r11,256(4)
24:	ld	r9,128(4)
25:	ld	r7,0(4)
26:	ld	r25,648(4)
27:	ld	r24,520(4)
28:	ld	r23,392(4)
29:	ld	r10,264(4)
30:	ld	r8,136(4)
31:	ldu	r6,8(4)
	cmpwi	r5,24
	/* steady state: stores of the previous dwords overlap new loads */
1:
32:	std	r22,648(3)
33:	std	r21,520(3)
34:	std	r20,392(3)
35:	std	r11,264(3)
36:	std	r9,136(3)
37:	std	r7,8(3)
38:	ld	r28,648(4)
39:	ld	r27,520(4)
40:	ld	r26,392(4)
41:	ld	r31,264(4)
42:	ld	r30,136(4)
43:	ld	r29,8(4)
44:	std	r25,656(3)
45:	std	r24,528(3)
46:	std	r23,400(3)
47:	std	r10,272(3)
48:	std	r8,144(3)
49:	std	r6,16(3)
50:	ld	r22,656(4)
51:	ld	r21,528(4)
52:	ld	r20,400(4)
53:	ld	r11,272(4)
54:	ld	r9,144(4)
55:	ld	r7,16(4)
56:	std	r28,664(3)
57:	std	r27,536(3)
58:	std	r26,408(3)
59:	std	r31,280(3)
60:	std	r30,152(3)
61:	stdu	r29,24(3)
62:	ld	r25,664(4)
63:	ld	r24,536(4)
64:	ld	r23,408(4)
65:	ld	r10,280(4)
66:	ld	r8,152(4)
67:	ldu	r6,24(4)
	bdnz	1b
	/* drain: store the last set of loaded dwords */
68:	std	r22,648(3)
69:	std	r21,520(3)
70:	std	r20,392(3)
71:	std	r11,264(3)
72:	std	r9,136(3)
73:	std	r7,8(3)
74:	addi	r4,r4,640
75:	addi	r3,r3,648
	bge	0b
	mtctr	r5
	/* simple pipelined loop for the remaining dwords */
76:	ld	r7,0(4)
77:	ld	r8,8(4)
78:	ldu	r9,16(4)
3:
79:	ld	r10,8(4)
80:	std	r7,8(3)
81:	ld	r7,16(4)
82:	std	r8,16(3)
83:	ld	r8,24(4)
84:	std	r9,24(3)
85:	ldu	r9,32(4)
86:	stdu	r10,32(3)
	bdnz	3b
4:
87:	ld	r10,8(4)
88:	std	r7,8(3)
89:	std	r8,16(3)
90:	std	r9,24(3)
91:	std	r10,32(3)
	/* restore the saved non-volatile GPRs and return success */
9:	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	li	r3,0
	blr

/*
 * on an exception, reset to the beginning and jump back into the
 * standard __copy_tofrom_user
 */
100:	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	ld	r3,-24(r1)		/* original dest/src saved at entry */
	ld	r4,-16(r1)
	li	r5,4096
	b	.Ldst_aligned

	/* every faultable access above funnels into the 100: handler */
	.section __ex_table,"a"
	.align	3
	.llong	20b,100b
	.llong	21b,100b
	.llong	22b,100b
	.llong	23b,100b
	.llong	24b,100b
	.llong	25b,100b
	.llong	26b,100b
	.llong	27b,100b
	.llong	28b,100b
	.llong	29b,100b
	.llong	30b,100b
	.llong	31b,100b
	.llong	32b,100b
	.llong	33b,100b
	.llong	34b,100b
	.llong	35b,100b
	.llong	36b,100b
	.llong	37b,100b
	.llong	38b,100b
	.llong	39b,100b
	.llong	40b,100b
	.llong	41b,100b
	.llong	42b,100b
	.llong	43b,100b
	.llong	44b,100b
	.llong	45b,100b
	.llong	46b,100b
	.llong	47b,100b
	.llong	48b,100b
	.llong	49b,100b
	.llong	50b,100b
	.llong	51b,100b
	.llong	52b,100b
	.llong	53b,100b
	.llong	54b,100b
	.llong	55b,100b
	.llong	56b,100b
	.llong	57b,100b
	.llong	58b,100b
	.llong	59b,100b
	.llong	60b,100b
	.llong	61b,100b
	.llong	62b,100b
	.llong	63b,100b
	.llong	64b,100b
	.llong	65b,100b
	.llong	66b,100b
	.llong	67b,100b
	.llong	68b,100b
	.llong	69b,100b
	.llong	70b,100b
	.llong	71b,100b
	.llong	72b,100b
	.llong	73b,100b
	.llong	74b,100b
	.llong	75b,100b
	.llong	76b,100b
	.llong	77b,100b
	.llong	78b,100b
	.llong	79b,100b
	.llong	80b,100b
	.llong	81b,100b
	.llong	82b,100b
	.llong	83b,100b
	.llong	84b,100b
	.llong	85b,100b
	.llong	86b,100b
	.llong	87b,100b
	.llong	88b,100b
	.llong	89b,100b
	.llong	90b,100b
	.llong	91b,100b