/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)
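
/*
 * The errN macros tag the user-space load or store that immediately
 * follows them: each one emits a local label and an __ex_table entry
 * pairing that instruction's address with a fixup label (.Ldo_errN), so
 * a fault on the access branches to the matching recovery code below.
 */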
	.macro err1
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Ldo_err1
	.previous
	.endm

	.macro err2
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldo_err2
	.previous
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	.section __ex_table,"a"
	.align 3
	.llong 300b,.Ldo_err3
	.previous
	.endm

	.macro err4
400:
	.section __ex_table,"a"
	.align 3
	.llong 400b,.Ldo_err4
	.previous
	.endm
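
/*
 * The fixup paths unwind progressively more state: err4 faults happen
 * with r14-r16 saved on the stack inside the VMX loop, err3 faults only
 * need VMX switched off, err2 faults have r14-r22 saved in our frame,
 * and err1 faults happen while no frame or saved registers are live.
 */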
.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	.exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */
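
/*
 * .Ldo_err2 runs with the non-volatile registers saved in our stack
 * frame; restore them, pop the frame, then fall into .Ldo_err1, which
 * reloads the original arguments and hands the copy to the base routine
 * so the number of uncopied bytes is reported correctly.
 */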
.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,48(r1)
	ld	r4,56(r1)
	ld	r5,64(r1)
	b	__copy_tofrom_user_base
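
/*
 * Entry: r3 = destination, r4 = source, r5 = length.  The three
 * arguments are stashed at 48/56/64(r1), in the caller's parameter save
 * area, so the fixup code above can recover them after a fault.
 */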
_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy
#endif
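
/*
 * Copies shorter than 16 bytes go straight to the byte/word tail code,
 * copies larger than 4096 bytes use the VMX loop when Altivec is
 * available, and everything else uses the scalar cacheline loop below.
 */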
.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)
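	/*
	 * cr7 now holds the low bits of -src: cr7*4+3 selects a 1 byte copy,
	 * cr7*4+2 a 2 byte copy and cr7*4+1 a 4 byte copy, which together
	 * bring the source up to an 8 byte boundary.
	 */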
	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
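	/* r6 = number of whole 128 byte cachelines left to copy */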
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6
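	/*
	 * cr7 now holds length bits 6:4, so cr7*4+1 selects an optional 64B
	 * block, cr7*4+2 a 32B block and cr7*4+3 a 16B block.
	 */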
6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
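	/*
	 * cr7 now holds the low four bits of the remaining length:
	 * cr7*4+0 selects an 8B copy, cr7*4+1 a 4B copy, cr7*4+2 a 2B copy
	 * and cr7*4+3 the final byte.
	 */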
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_usercopy
	cmpwi	r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
	mtlr	r0
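
	/*
	 * r6/r9 below hold the cacheline-aligned source and destination
	 * addresses (with stream IDs 0 and 1 encoded in the low bits),
	 * r7/r10 hold the stream length in cachelines plus a prefetch
	 * depth of 7, and r8 carries the GO bit that starts the streams.
	 */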
	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	cr1,r7,0x3FF
	ble	cr1,1f
	li	r7,0x3FF

1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

.machine push
.machine "power4"
	dcbt	r0,r6,0b01000
	dcbt	r0,r7,0b01010
	dcbtst	r0,r9,0b01000
	dcbtst	r0,r10,0b01010
	eieio
	dcbt	r0,r8,0b01010	/* GO */
.machine pop

	beq	.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
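	/*
	 * Only the low four bits of (src XOR dst) matter: if they differ,
	 * the two pointers can never be 16 byte aligned at the same time,
	 * so fall back to the vperm-based unaligned loop.
	 */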
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6
	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)
	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112
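	/*
	 * r9-r12 and r14-r16 hold the constant offsets 16,32,...,112 so the
	 * eight lvx/stvx pairs in the loop below cover one full 128 byte
	 * cacheline per iteration.
	 */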
	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	vr7,r0,r4
err4;	lvx	vr6,r4,r9
err4;	lvx	vr5,r4,r10
err4;	lvx	vr4,r4,r11
err4;	lvx	vr3,r4,r12
err4;	lvx	vr2,r4,r14
err4;	lvx	vr1,r4,r15
err4;	lvx	vr0,r4,r16
	addi	r4,r4,128
err4;	stvx	vr7,r0,r3
err4;	stvx	vr6,r3,r9
err4;	stvx	vr5,r3,r10
err4;	stvx	vr4,r3,r11
err4;	stvx	vr3,r3,r12
err4;	stvx	vr2,r3,r14
err4;	stvx	vr1,r3,r15
err4;	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6
	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)
	li	r9,16
	li	r10,32
	li	r11,48

	lvsl	vr16,0,r4	/* Setup permute control vector */
err3;	lvx	vr0,0,r4
	addi	r4,r4,16
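	/*
	 * From here on the copy is software pipelined: each store writes
	 * vperm(previous vector, newly loaded vector, vr16), so the source
	 * pointer always runs 16 bytes ahead of the data actually stored.
	 * That extra 16 bytes is unwound before the final sub-16B tail.
	 */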
	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
err4;	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
err4;	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
err4;	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
err4;	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
err4;	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
err4;	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
err4;	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	addi	r4,r4,128
err4;	stvx	vr8,r0,r3
err4;	stvx	vr9,r3,r9
err4;	stvx	vr10,r3,r10
err4;	stvx	vr11,r3,r11
err4;	stvx	vr12,r3,r12
err4;	stvx	vr13,r3,r14
err4;	stvx	vr14,r3,r15
err4;	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */