copyuser_power7.S

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>
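
/*
 * The errN macros tag the user-access instruction that follows them: each
 * expands to a local label plus an __ex_table entry pairing that address
 * with a fixup handler (.Ldo_err1 .. .Ldo_err4), so a fault on the tagged
 * load or store branches to the matching recovery path below.
 */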
        .macro err1
100:
        .section __ex_table,"a"
        .align 3
        .llong 100b,.Ldo_err1
        .previous
        .endm

        .macro err2
200:
        .section __ex_table,"a"
        .align 3
        .llong 200b,.Ldo_err2
        .previous
        .endm

#ifdef CONFIG_ALTIVEC
        .macro err3
300:
        .section __ex_table,"a"
        .align 3
        .llong 300b,.Ldo_err3
        .previous
        .endm

        .macro err4
400:
        .section __ex_table,"a"
        .align 3
        .llong 400b,.Ldo_err4
        .previous
        .endm
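
/*
 * Fault recovery paths.  err4 faults occur with VMX live and r14-r16 saved,
 * so .Ldo_err4 restores those and falls into .Ldo_err3, which leaves VMX via
 * exit_vmx_usercopy.  err2 faults restore r14-r22; err1 faults happen with
 * no extra state to restore.  All paths reload the original dest/src/len
 * from the stack and rerun the copy through __copy_tofrom_user_base, whose
 * own fixup code works out how many bytes were left uncopied.
 */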
.Ldo_err4:
        ld      r16,STK_REG(R16)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
.Ldo_err3:
        bl      .exit_vmx_usercopy
        ld      r0,STACKFRAMESIZE+16(r1)
        mtlr    r0
        b       .Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
        ld      r22,STK_REG(R22)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r14,STK_REG(R14)(r1)
.Lexit:
        addi    r1,r1,STACKFRAMESIZE
.Ldo_err1:
        ld      r3,48(r1)
        ld      r4,56(r1)
        ld      r5,64(r1)
        b       __copy_tofrom_user_base
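
/*
 * __copy_tofrom_user_power7(to=r3, from=r4, len=r5)
 *
 * Copies of less than 16 bytes go straight to .Lshort_copy; with ALTIVEC,
 * copies larger than 4096 bytes take the VMX path; everything else uses the
 * GPR cacheline loop.  r3 is 0 on success; on a fault the handlers above
 * redo the copy through __copy_tofrom_user_base.
 */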
_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
        cmpldi  r5,16
        cmpldi  cr1,r5,4096

        std     r3,48(r1)
        std     r4,56(r1)
        std     r5,64(r1)

        blt     .Lshort_copy
        bgt     cr1,.Lvmx_copy
#else
        cmpldi  r5,16

        std     r3,48(r1)
        std     r4,56(r1)
        std     r5,64(r1)

        blt     .Lshort_copy
#endif
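
/*
 * Scalar (GPR) path: byte/halfword/word moves bring the source up to an
 * 8-byte boundary, 128-byte cachelines are then streamed through sixteen
 * GPRs per iteration, and the remainder is finished in 64/32/16-byte steps.
 * The mtocrf 0x01 instructions load cr7 from the low bits of the residual
 * count, so each "bf cr7*4+n" skips the step whose size bit is clear.
 */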
.Lnonvmx_copy:
        /* Get the source 8B aligned */
        neg     r6,r4
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-3)

        bf      cr7*4+3,1f
err1;   lbz     r0,0(r4)
        addi    r4,r4,1
err1;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

3:      sub     r5,r5,r6
        cmpldi  r5,128
        blt     5f

        mflr    r0
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        std     r17,STK_REG(R17)(r1)
        std     r18,STK_REG(R18)(r1)
        std     r19,STK_REG(R19)(r1)
        std     r20,STK_REG(R20)(r1)
        std     r21,STK_REG(R21)(r1)
        std     r22,STK_REG(R22)(r1)
        std     r0,STACKFRAMESIZE+16(r1)

        srdi    r6,r5,7
        mtctr   r6

        /* Now do cacheline (128B) sized loads and stores. */
        .align  5
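        /*
         * Each iteration moves one 128B line: sixteen 8-byte loads into
         * r0,r6-r12,r14-r21 followed by the matching stores.  CTR holds the
         * number of whole cachelines (len >> 7).
         */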
4:
err2;   ld      r0,0(r4)
err2;   ld      r6,8(r4)
err2;   ld      r7,16(r4)
err2;   ld      r8,24(r4)
err2;   ld      r9,32(r4)
err2;   ld      r10,40(r4)
err2;   ld      r11,48(r4)
err2;   ld      r12,56(r4)
err2;   ld      r14,64(r4)
err2;   ld      r15,72(r4)
err2;   ld      r16,80(r4)
err2;   ld      r17,88(r4)
err2;   ld      r18,96(r4)
err2;   ld      r19,104(r4)
err2;   ld      r20,112(r4)
err2;   ld      r21,120(r4)
        addi    r4,r4,128
err2;   std     r0,0(r3)
err2;   std     r6,8(r3)
err2;   std     r7,16(r3)
err2;   std     r8,24(r3)
err2;   std     r9,32(r3)
err2;   std     r10,40(r3)
err2;   std     r11,48(r3)
err2;   std     r12,56(r3)
err2;   std     r14,64(r3)
err2;   std     r15,72(r3)
err2;   std     r16,80(r3)
err2;   std     r17,88(r3)
err2;   std     r18,96(r3)
err2;   std     r19,104(r3)
err2;   std     r20,112(r3)
err2;   std     r21,120(r3)
        addi    r3,r3,128
        bdnz    4b

        clrldi  r5,r5,(64-7)

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r22,STK_REG(R22)(r1)
        addi    r1,r1,STACKFRAMESIZE

        /* Up to 127B to go */
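        /*
         * cr7 is loaded from len >> 4, so cr7*4+1 tests the 64-byte bit,
         * cr7*4+2 the 32-byte bit and cr7*4+3 the 16-byte bit of the
         * remaining length.
         */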
5:      srdi    r6,r5,4
        mtocrf  0x01,r6

6:      bf      cr7*4+1,7f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
err1;   ld      r9,32(r4)
err1;   ld      r10,40(r4)
err1;   ld      r11,48(r4)
err1;   ld      r12,56(r4)
        addi    r4,r4,64
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
err1;   std     r9,32(r3)
err1;   std     r10,40(r3)
err1;   std     r11,48(r3)
err1;   std     r12,56(r3)
        addi    r3,r3,64

        /* Up to 63B to go */
7:      bf      cr7*4+2,8f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
err1;   ld      r7,16(r4)
err1;   ld      r8,24(r4)
        addi    r4,r4,32
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
err1;   std     r7,16(r3)
err1;   std     r8,24(r3)
        addi    r3,r3,32

        /* Up to 31B to go */
8:      bf      cr7*4+3,9f
err1;   ld      r0,0(r4)
err1;   ld      r6,8(r4)
        addi    r4,r4,16
err1;   std     r0,0(r3)
err1;   std     r6,8(r3)
        addi    r3,r3,16

9:      clrldi  r5,r5,(64-4)

        /* Up to 15B to go */
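        /*
         * Final sub-16B tail: cr7 now holds the low four bits of the length,
         * selecting an 8-byte (done as two word ops), 4-byte, 2-byte and
         * 1-byte step as needed.
         */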
.Lshort_copy:
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err1;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err1;   lwz     r6,4(r4)
        addi    r4,r4,8
err1;   stw     r0,0(r3)
err1;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err1;   lwz     r0,0(r4)
        addi    r4,r4,4
err1;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err1;   lhz     r0,0(r4)
        addi    r4,r4,2
err1;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err1;   lbz     r0,0(r4)
err1;   stb     r0,0(r3)

15:     li      r3,0
        blr
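
/*
 * Reached from the VMX path below when enter_vmx_usercopy indicates the
 * vector unit cannot be used here (r3 == 0): pop the frame allocated in
 * .Lvmx_copy and redo the whole copy through the GPR path.
 */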
.Lunwind_stack_nonvmx_copy:
        addi    r1,r1,STACKFRAMESIZE
        b       .Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
        mflr    r0
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
        bl      .enter_vmx_usercopy
        cmpwi   cr1,r3,0
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STACKFRAMESIZE+48(r1)
        ld      r4,STACKFRAMESIZE+56(r1)
        ld      r5,STACKFRAMESIZE+64(r1)
        mtlr    r0

        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
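        /*
         * The dcbt/dcbtst forms with TH=0b01000 pass each stream's start
         * address; the TH=0b01010 forms carry the encoded unit count (number
         * of cachelines, capped at 0x3FF), depth and stream ID built up in
         * r7/r10.  The final dcbt with the GO bit set starts the streams.
         */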
        clrrdi  r6,r4,7
        clrrdi  r9,r3,7
        ori     r9,r9,1         /* stream=1 */

        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
        cmpldi  r7,0x3FF
        ble     1f
        li      r7,0x3FF
1:      lis     r0,0x0E00       /* depth=7 */
        sldi    r7,r7,7
        or      r7,r7,r0
        ori     r10,r7,1        /* stream=1 */

        lis     r8,0x8000       /* GO=1 */
        clrldi  r8,r8,32

        .machine push
        .machine "power4"
        dcbt    r0,r6,0b01000
        dcbt    r0,r7,0b01010
        dcbtst  r0,r9,0b01000
        dcbtst  r0,r10,0b01010
        eieio
        dcbt    r0,r8,0b01010   /* GO */
        .machine pop

        beq     cr1,.Lunwind_stack_nonvmx_copy

        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
        xor     r6,r4,r3
        rldicl. r6,r6,0,(64-4)
        bne     .Lvmx_unaligned_copy

        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48
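        /*
         * r9/r10/r11 (and later r12,r14-r16) hold 16,32,... byte offsets so
         * the indexed lvx/stvx forms can address successive quadwords
         * without updating r3/r4 after every access.
         */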
        bf      cr7*4+3,5f
err3;   lvx     vr1,r0,r4
        addi    r4,r4,16
err3;   stvx    vr1,r0,r3
        addi    r3,r3,16

5:      bf      cr7*4+2,6f
err3;   lvx     vr1,r0,r4
err3;   lvx     vr0,r4,r9
        addi    r4,r4,32
err3;   stvx    vr1,r0,r3
err3;   stvx    vr0,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     vr3,r0,r4
err3;   lvx     vr2,r4,r9
err3;   lvx     vr1,r4,r10
err3;   lvx     vr0,r4,r11
        addi    r4,r4,64
err3;   stvx    vr3,r0,r3
err3;   stvx    vr2,r3,r9
err3;   stvx    vr1,r3,r10
err3;   stvx    vr0,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
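        /*
         * Eight 16B vector loads and stores per iteration, i.e. one 128B
         * cacheline; CTR again counts whole cachelines.
         */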
8:
err4;   lvx     vr7,r0,r4
err4;   lvx     vr6,r4,r9
err4;   lvx     vr5,r4,r10
err4;   lvx     vr4,r4,r11
err4;   lvx     vr3,r4,r12
err4;   lvx     vr2,r4,r14
err4;   lvx     vr1,r4,r15
err4;   lvx     vr0,r4,r16
        addi    r4,r4,128
err4;   stvx    vr7,r0,r3
err4;   stvx    vr6,r3,r9
err4;   stvx    vr5,r3,r10
err4;   stvx    vr4,r3,r11
err4;   stvx    vr3,r3,r12
err4;   stvx    vr2,r3,r14
err4;   stvx    vr1,r3,r15
err4;   stvx    vr0,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     vr3,r0,r4
err3;   lvx     vr2,r4,r9
err3;   lvx     vr1,r4,r10
err3;   lvx     vr0,r4,r11
        addi    r4,r4,64
err3;   stvx    vr3,r0,r3
err3;   stvx    vr2,r3,r9
err3;   stvx    vr1,r3,r10
err3;   stvx    vr0,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     vr1,r0,r4
err3;   lvx     vr0,r4,r9
        addi    r4,r4,32
err3;   stvx    vr1,r0,r3
err3;   stvx    vr0,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     vr1,r0,r4
        addi    r4,r4,16
err3;   stvx    vr1,r0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   ld      r0,0(r4)
        addi    r4,r4,8
err3;   std     r0,0(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        b       .exit_vmx_usercopy      /* tail call optimise */
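
/*
 * Source and destination differ in their low four address bits, so aligned
 * 16B loads cannot be stored directly.  lvsl builds a permute control vector
 * from the source misalignment; each step then loads the next aligned
 * quadword and vperm splices it with the previous one to produce
 * destination-aligned data.
 */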
.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
err3;   lbz     r0,0(r4)
        addi    r4,r4,1
err3;   stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r7,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r7,4(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        lvsl    vr16,0,r4       /* Setup permute control vector */
err3;   lvx     vr0,0,r4
        addi    r4,r4,16
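        /*
         * vr0 always holds the previously loaded aligned quadword; r4 runs
         * 16 bytes ahead of the data actually consumed, which label 11 below
         * unwinds before the scalar tail.
         */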
        bf      cr7*4+3,5f
err3;   lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
        addi    r4,r4,16
err3;   stvx    vr8,r0,r3
        addi    r3,r3,16
        vor     vr0,vr1,vr1

5:      bf      cr7*4+2,6f
err3;   lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
err3;   lvx     vr0,r4,r9
        vperm   vr9,vr1,vr0,vr16
        addi    r4,r4,32
err3;   stvx    vr8,r0,r3
err3;   stvx    vr9,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
err3;   lvx     vr3,r0,r4
        vperm   vr8,vr0,vr3,vr16
err3;   lvx     vr2,r4,r9
        vperm   vr9,vr3,vr2,vr16
err3;   lvx     vr1,r4,r10
        vperm   vr10,vr2,vr1,vr16
err3;   lvx     vr0,r4,r11
        vperm   vr11,vr1,vr0,vr16
        addi    r4,r4,64
err3;   stvx    vr8,r0,r3
err3;   stvx    vr9,r3,r9
err3;   stvx    vr10,r3,r10
err3;   stvx    vr11,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
err4;   lvx     vr7,r0,r4
        vperm   vr8,vr0,vr7,vr16
err4;   lvx     vr6,r4,r9
        vperm   vr9,vr7,vr6,vr16
err4;   lvx     vr5,r4,r10
        vperm   vr10,vr6,vr5,vr16
err4;   lvx     vr4,r4,r11
        vperm   vr11,vr5,vr4,vr16
err4;   lvx     vr3,r4,r12
        vperm   vr12,vr4,vr3,vr16
err4;   lvx     vr2,r4,r14
        vperm   vr13,vr3,vr2,vr16
err4;   lvx     vr1,r4,r15
        vperm   vr14,vr2,vr1,vr16
err4;   lvx     vr0,r4,r16
        vperm   vr15,vr1,vr0,vr16
        addi    r4,r4,128
err4;   stvx    vr8,r0,r3
err4;   stvx    vr9,r3,r9
err4;   stvx    vr10,r3,r10
err4;   stvx    vr11,r3,r11
err4;   stvx    vr12,r3,r12
err4;   stvx    vr13,r3,r14
err4;   stvx    vr14,r3,r15
err4;   stvx    vr15,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
err3;   lvx     vr3,r0,r4
        vperm   vr8,vr0,vr3,vr16
err3;   lvx     vr2,r4,r9
        vperm   vr9,vr3,vr2,vr16
err3;   lvx     vr1,r4,r10
        vperm   vr10,vr2,vr1,vr16
err3;   lvx     vr0,r4,r11
        vperm   vr11,vr1,vr0,vr16
        addi    r4,r4,64
err3;   stvx    vr8,r0,r3
err3;   stvx    vr9,r3,r9
err3;   stvx    vr10,r3,r10
err3;   stvx    vr11,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
err3;   lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
err3;   lvx     vr0,r4,r9
        vperm   vr9,vr1,vr0,vr16
        addi    r4,r4,32
err3;   stvx    vr8,r0,r3
err3;   stvx    vr9,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
err3;   lvx     vr1,r0,r4
        vperm   vr8,vr0,vr1,vr16
        addi    r4,r4,16
err3;   stvx    vr8,r0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        addi    r4,r4,-16       /* Unwind the +16 load offset */
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
err3;   lwz     r0,0(r4)        /* Less chance of a reject with word ops */
err3;   lwz     r6,4(r4)
        addi    r4,r4,8
err3;   stw     r0,0(r3)
err3;   stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
err3;   lwz     r0,0(r4)
        addi    r4,r4,4
err3;   stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
err3;   lhz     r0,0(r4)
        addi    r4,r4,2
err3;   sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
err3;   lbz     r0,0(r4)
err3;   stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        b       .exit_vmx_usercopy      /* tail call optimise */
#endif /* CONFIG_ALTIVEC */