vector.S 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. #include <asm/ppc_asm.h>
  2. #include <asm/processor.h>
  3. /*
  4. * The routines below are in assembler so we can closely control the
  5. * usage of floating-point registers. These routines must be called
  6. * with preempt disabled.
  7. */
  /*
   * Single-precision IEEE-754 constants, stored as raw 32-bit words.
   * The routines in this file load them with lfs.
   */
          .data
  fpzero:
          .long   0x00000000              /* 0.0 in single-precision FP */
  fpone:
          .long   0x3f800000              /* 1.0 in single-precision FP */
  fphalf:
          .long   0x3f000000              /* 0.5 in single-precision FP */

          .text
  /*
   * fpenable - internal helper: switch the FPU on and start from FPSCR = 0.
   *
   * Do not call this from C; it does not use the normal calling convention.
   *
   * In:    r1   = current stack frame; the doublewords at 8(r1), 16(r1)
   *               and 24(r1) are used as save slots for fr31, fr1 and fr0
   * Out:   r10  = MSR as it was on entry (fpdisable writes it back, so the
   *               caller must leave r10 alone until then)
   *        fr31 = FPSCR as it was on entry (fpdisable writes it back)
   *        fr1  = 0.0
   * Clobb: r11; MSR gets MSR_FP set; every FPSCR field is written from fr1
   */
  fpenable:
          mfmsr   r10                     /* keep the original MSR in r10 */
          ori     r11,r10,MSR_FP
          mtmsr   r11                     /* MSR |= MSR_FP */
          isync
          lis     r11,fpzero@ha           /* r11 is scratch again: high half of &fpzero */
          stfd    fr31,8(r1)              /* park the FP registers we are about to use */
          stfd    fr1,16(r1)
          stfd    fr0,24(r1)
          mffs    fr31                    /* fr31 = incoming FPSCR */
          lfs     fr1,fpzero@l(r11)       /* fr1 = 0.0 */
          mtfsf   0xff,fr1                /* write all eight FPSCR fields from fr1 */
          blr
  /*
   * fpdisable - internal helper: undo what fpenable did.
   *
   * In:    r10  = MSR value saved by fpenable
   *        fr31 = FPSCR value saved by fpenable
   *        r1   = same stack frame fpenable was called with
   * Out:   FPSCR, fr0, fr1, fr31 and the MSR are written back from those
   *        saved copies.
   */
  fpdisable:
          mtfsf   0xff,fr31               /* FPSCR first, while fr31 still holds the saved copy */
          lfd     fr0,24(r1)
          lfd     fr1,16(r1)
          lfd     fr31,8(r1)
          mtmsr   r10                     /* back to the MSR captured by fpenable */
          isync
          blr
  /*
   * vaddfp - vector add, single-precision floating point.
   *
   * r3 -> destination, r4 -> first source, r5 -> second source.
   * For each of the 4 elements: dst[i] = a[i] + b[i].
   * Must be called with preempt disabled.
   *
   * Clobbers r0, r6, r10, r11 and CTR.  fr0, fr1, fr31, the FPSCR and the
   * MSR are saved by fpenable and put back by fpdisable.
   */
          .globl  vaddfp
  vaddfp:
          stwu    r1,-32(r1)              /* 32-byte frame; 8..31(r1) are fpenable's save slots */
          mflr    r0
          stw     r0,36(r1)               /* LR into the caller's frame (32 + 4) */
          bl      fpenable
          li      r6,0                    /* r6 = byte offset of the current element */
          li      r0,4
          mtctr   r0                      /* CTR = element count */
  .Lvaddfp_next:
          lfsx    fr0,r4,r6               /* fr0 = a[i] */
          lfsx    fr1,r5,r6               /* fr1 = b[i] */
          fadds   fr0,fr0,fr1
          stfsx   fr0,r3,r6               /* dst[i] = a[i] + b[i] */
          addi    r6,r6,4
          bdnz    .Lvaddfp_next
          bl      fpdisable
          lwz     r0,36(r1)
          mtlr    r0
          addi    r1,r1,32
          blr
  /*
   * vsubfp - vector subtract, single-precision floating point.
   *
   * r3 -> destination, r4 -> minuend vector, r5 -> subtrahend vector.
   * For each of the 4 elements: dst[i] = a[i] - b[i].
   * Must be called with preempt disabled.
   *
   * Clobbers r0, r6, r10, r11 and CTR.  fr0, fr1, fr31, the FPSCR and the
   * MSR are saved by fpenable and put back by fpdisable.
   */
          .globl  vsubfp
  vsubfp:
          stwu    r1,-32(r1)              /* 32-byte frame; 8..31(r1) are fpenable's save slots */
          mflr    r0
          stw     r0,36(r1)               /* LR into the caller's frame (32 + 4) */
          bl      fpenable
          li      r6,0                    /* r6 = byte offset of the current element */
          li      r0,4
          mtctr   r0                      /* CTR = element count */
  .Lvsubfp_next:
          lfsx    fr0,r4,r6               /* fr0 = a[i] */
          lfsx    fr1,r5,r6               /* fr1 = b[i] */
          fsubs   fr0,fr0,fr1
          stfsx   fr0,r3,r6               /* dst[i] = a[i] - b[i] */
          addi    r6,r6,4
          bdnz    .Lvsubfp_next
          bl      fpdisable
          lwz     r0,36(r1)
          mtlr    r0
          addi    r1,r1,32
          blr
  /*
   * vmaddfp - vector multiply and add, single-precision floating point.
   *
   * r3 -> destination, r4 -> a, r5 -> b, r6 -> c.
   * For each of the 4 elements: dst[i] = a[i] * c[i] + b[i].
   * Must be called with preempt disabled.
   *
   * Clobbers r0, r7, r10, r11 and CTR.  fr2 is saved at 32(r1) and
   * reloaded here; fr0, fr1, fr31, the FPSCR and the MSR are handled by
   * fpenable/fpdisable.
   */
          .globl  vmaddfp
  vmaddfp:
          stwu    r1,-48(r1)              /* 48-byte frame: fpenable's slots plus one for fr2 */
          mflr    r0
          stw     r0,52(r1)               /* LR into the caller's frame (48 + 4) */
          bl      fpenable
          li      r7,0                    /* r7 = byte offset of the current element */
          stfd    fr2,32(r1)              /* FP is on now, so fr2 can be saved */
          li      r0,4
          mtctr   r0                      /* CTR = element count */
  .Lvmaddfp_next:
          lfsx    fr0,r4,r7               /* fr0 = a[i] */
          lfsx    fr1,r5,r7               /* fr1 = b[i] */
          lfsx    fr2,r6,r7               /* fr2 = c[i] */
          fmadds  fr0,fr0,fr2,fr1         /* fr0 = fr0 * fr2 + fr1 */
          stfsx   fr0,r3,r7
          addi    r7,r7,4
          bdnz    .Lvmaddfp_next
          lfd     fr2,32(r1)              /* reload fr2 before fpdisable restores the MSR */
          bl      fpdisable
          lwz     r0,52(r1)
          mtlr    r0
          addi    r1,r1,48
          blr
  /*
   * vnmsubfp - vector negative multiply and subtract, single-precision
   * floating point.
   *
   * r3 -> destination, r4 -> a, r5 -> b, r6 -> c.
   * For each of the 4 elements: dst[i] = -(a[i] * c[i] - b[i]).
   * Must be called with preempt disabled.
   *
   * Clobbers r0, r7, r10, r11 and CTR.  fr2 is saved at 32(r1) and
   * reloaded here; fr0, fr1, fr31, the FPSCR and the MSR are handled by
   * fpenable/fpdisable.
   */
          .globl  vnmsubfp
  vnmsubfp:
          stwu    r1,-48(r1)              /* 48-byte frame: fpenable's slots plus one for fr2 */
          mflr    r0
          stw     r0,52(r1)               /* LR into the caller's frame (48 + 4) */
          bl      fpenable
          li      r7,0                    /* r7 = byte offset of the current element */
          stfd    fr2,32(r1)              /* FP is on now, so fr2 can be saved */
          li      r0,4
          mtctr   r0                      /* CTR = element count */
  .Lvnmsubfp_next:
          lfsx    fr0,r4,r7               /* fr0 = a[i] */
          lfsx    fr1,r5,r7               /* fr1 = b[i] */
          lfsx    fr2,r6,r7               /* fr2 = c[i] */
          fnmsubs fr0,fr0,fr2,fr1         /* fr0 = -(fr0 * fr2 - fr1) */
          stfsx   fr0,r3,r7
          addi    r7,r7,4
          bdnz    .Lvnmsubfp_next
          lfd     fr2,32(r1)              /* reload fr2 before fpdisable restores the MSR */
          bl      fpdisable
          lwz     r0,52(r1)
          mtlr    r0
          addi    r1,r1,48
          blr
  /*
   * vrefp - vector reciprocal estimate, single-precision floating point.
   * No estimate instruction is used; each result is simply 1.0 / x.
   *
   * r3 -> destination, r4 -> source.
   * For each of the 4 elements: dst[i] = 1.0 / src[i].
   * Must be called with preempt disabled.
   *
   * Clobbers r0, r6, r9, r10, r11 and CTR.  fr0, fr1, fr31, the FPSCR and
   * the MSR are saved by fpenable and put back by fpdisable.
   */
          .globl  vrefp
  vrefp:
          stwu    r1,-32(r1)              /* 32-byte frame; 8..31(r1) are fpenable's save slots */
          mflr    r0
          stw     r0,36(r1)               /* LR into the caller's frame (32 + 4) */
          bl      fpenable
          lis     r9,fpone@ha
          lfs     fr1,fpone@l(r9)         /* fr1 = 1.0, loop-invariant numerator */
          li      r6,0                    /* r6 = byte offset of the current element */
          li      r0,4
          mtctr   r0                      /* CTR = element count */
  .Lvrefp_next:
          lfsx    fr0,r4,r6               /* fr0 = x */
          fdivs   fr0,fr1,fr0             /* fr0 = 1.0 / x */
          stfsx   fr0,r3,r6
          addi    r6,r6,4
          bdnz    .Lvrefp_next
          bl      fpdisable
          lwz     r0,36(r1)
          mtlr    r0
          addi    r1,r1,32
          blr
  /*
   * vrsqrtefp - vector reciprocal square-root estimate, single-precision
   * floating point.
   *
   * r3 -> destination, r4 -> source; 4 elements.
   * frsqrte supplies the initial estimate r of 1/sqrt(s); two iterations
   * of Newton-Raphson,
   *         r = r + 0.5 * r * (1 - s * r * r)
   * then bring it to sufficient accuracy.
   * Must be called with preempt disabled.
   *
   * Clobbers r0, r6, r8, r9, r10, r11 and CTR.  fr2-fr5 are saved and
   * reloaded here; fr0, fr1, fr31, the FPSCR and the MSR are handled by
   * fpenable/fpdisable.
   *
   * Frame layout (64 bytes):
   *     0(r1)  back chain
   *     8(r1)  fr31  \
   *    16(r1)  fr1    > written by fpenable, reloaded by fpdisable
   *    24(r1)  fr0   /
   *    32(r1)  fr2
   *    40(r1)  fr3
   *    48(r1)  fr4
   *    56(r1)  fr5
   *    68(r1)  LR, stored in the caller's frame (64 + 4)
   *
   * The frame must be 64 bytes so that the fr4 and fr5 slots at 48(r1) and
   * 56(r1) lie inside it.  With a smaller 48-byte frame those two stores
   * would land on the caller's back chain and on the LR stored at 52(r1).
   * The epilogue must reload LR from, and pop, exactly what the prologue
   * set up.
   */
          .globl  vrsqrtefp
  vrsqrtefp:
          stwu    r1,-64(r1)
          mflr    r0
          stw     r0,68(r1)
          bl      fpenable
          stfd    fr2,32(r1)
          stfd    fr3,40(r1)
          stfd    fr4,48(r1)
          stfd    fr5,56(r1)
          lis     r9,fpone@ha
          lis     r8,fphalf@ha
          li      r0,4
          lfs     fr4,fpone@l(r9)         /* fr4 = 1.0 */
          lfs     fr5,fphalf@l(r8)        /* fr5 = 0.5 */
          mtctr   r0                      /* CTR = element count */
          li      r6,0                    /* r6 = byte offset of the current element */
  1:      lfsx    fr0,r4,r6               /* fr0 = s */
          frsqrte fr1,fr0                 /* r = frsqrte(s) */
          fmuls   fr3,fr1,fr0             /* r * s */
          fmuls   fr2,fr1,fr5             /* r * 0.5 */
          fnmsubs fr3,fr1,fr3,fr4         /* 1 - s * r * r */
          fmadds  fr1,fr2,fr3,fr1         /* r = r + 0.5 * r * (1 - s * r * r) */
          fmuls   fr3,fr1,fr0             /* r * s */
          fmuls   fr2,fr1,fr5             /* r * 0.5 */
          fnmsubs fr3,fr1,fr3,fr4         /* 1 - s * r * r */
          fmadds  fr1,fr2,fr3,fr1         /* r = r + 0.5 * r * (1 - s * r * r) */
          stfsx   fr1,r3,r6
          addi    r6,r6,4
          bdnz    1b
          lfd     fr5,56(r1)              /* reload fr2-fr5 before fpdisable restores the MSR */
          lfd     fr4,48(r1)
          lfd     fr3,40(r1)
          lfd     fr2,32(r1)
          bl      fpdisable
          lwz     r0,68(r1)               /* same slot the prologue stored LR in */
          mtlr    r0
          addi    r1,r1,64                /* pop exactly the 64 bytes pushed above */
          blr
  208. blr