vector.S 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. #include <asm/ppc_asm.h>
  2. #include <asm/processor.h>
  3. /*
  4. * The routines below are in assembler so we can closely control the
  5. * usage of floating-point registers. These routines must be called
  6. * with preempt disabled.
  7. */
  8. .section ".toc","aw"
  9. fpzero:
  10. .tc FD_0_0[TC],0
  11. fpone:
  12. .tc FD_3ff00000_0[TC],0x3ff0000000000000 /* 1.0 */
  13. fphalf:
  14. .tc FD_3fe00000_0[TC],0x3fe0000000000000 /* 0.5 */
  15. .text
  16. /*
  17. * Internal routine to enable floating point and set FPSCR to 0.
  18. * Don't call it from C; it doesn't use the normal calling convention.
  19. */
  20. fpenable:
  21. mfmsr r10
  22. ori r11,r10,MSR_FP
  23. mtmsr r11
  24. isync
  25. stfd fr31,-8(r1)
  26. stfd fr0,-16(r1)
  27. stfd fr1,-24(r1)
  28. mffs fr31
  29. lfd fr1,fpzero@toc(r2)
  30. mtfsf 0xff,fr1
  31. blr
  32. fpdisable:
  33. mtlr r12
  34. mtfsf 0xff,fr31
  35. lfd fr1,-24(r1)
  36. lfd fr0,-16(r1)
  37. lfd fr31,-8(r1)
  38. mtmsr r10
  39. isync
  40. blr
  41. /*
  42. * Vector add, floating point.
  43. */
  44. _GLOBAL(vaddfp)
  45. mflr r12
  46. bl fpenable
  47. li r0,4
  48. mtctr r0
  49. li r6,0
  50. 1: lfsx fr0,r4,r6
  51. lfsx fr1,r5,r6
  52. fadds fr0,fr0,fr1
  53. stfsx fr0,r3,r6
  54. addi r6,r6,4
  55. bdnz 1b
  56. b fpdisable
  57. /*
  58. * Vector subtract, floating point.
  59. */
  60. _GLOBAL(vsubfp)
  61. mflr r12
  62. bl fpenable
  63. li r0,4
  64. mtctr r0
  65. li r6,0
  66. 1: lfsx fr0,r4,r6
  67. lfsx fr1,r5,r6
  68. fsubs fr0,fr0,fr1
  69. stfsx fr0,r3,r6
  70. addi r6,r6,4
  71. bdnz 1b
  72. b fpdisable
  73. /*
  74. * Vector multiply and add, floating point.
  75. */
  76. _GLOBAL(vmaddfp)
  77. mflr r12
  78. bl fpenable
  79. stfd fr2,-32(r1)
  80. li r0,4
  81. mtctr r0
  82. li r7,0
  83. 1: lfsx fr0,r4,r7
  84. lfsx fr1,r5,r7
  85. lfsx fr2,r6,r7
  86. fmadds fr0,fr0,fr2,fr1
  87. stfsx fr0,r3,r7
  88. addi r7,r7,4
  89. bdnz 1b
  90. lfd fr2,-32(r1)
  91. b fpdisable
  92. /*
  93. * Vector negative multiply and subtract, floating point.
  94. */
  95. _GLOBAL(vnmsubfp)
  96. mflr r12
  97. bl fpenable
  98. stfd fr2,-32(r1)
  99. li r0,4
  100. mtctr r0
  101. li r7,0
  102. 1: lfsx fr0,r4,r7
  103. lfsx fr1,r5,r7
  104. lfsx fr2,r6,r7
  105. fnmsubs fr0,fr0,fr2,fr1
  106. stfsx fr0,r3,r7
  107. addi r7,r7,4
  108. bdnz 1b
  109. lfd fr2,-32(r1)
  110. b fpdisable
  111. /*
  112. * Vector reciprocal estimate. We just compute 1.0/x.
  113. * r3 -> destination, r4 -> source.
  114. */
  115. _GLOBAL(vrefp)
  116. mflr r12
  117. bl fpenable
  118. li r0,4
  119. lfd fr1,fpone@toc(r2)
  120. mtctr r0
  121. li r6,0
  122. 1: lfsx fr0,r4,r6
  123. fdivs fr0,fr1,fr0
  124. stfsx fr0,r3,r6
  125. addi r6,r6,4
  126. bdnz 1b
  127. b fpdisable
  128. /*
  129. * Vector reciprocal square-root estimate, floating point.
  130. * We use the frsqrte instruction for the initial estimate followed
  131. * by 2 iterations of Newton-Raphson to get sufficient accuracy.
  132. * r3 -> destination, r4 -> source.
  133. */
  134. _GLOBAL(vrsqrtefp)
  135. mflr r12
  136. bl fpenable
  137. stfd fr2,-32(r1)
  138. stfd fr3,-40(r1)
  139. stfd fr4,-48(r1)
  140. stfd fr5,-56(r1)
  141. li r0,4
  142. lfd fr4,fpone@toc(r2)
  143. lfd fr5,fphalf@toc(r2)
  144. mtctr r0
  145. li r6,0
  146. 1: lfsx fr0,r4,r6
  147. frsqrte fr1,fr0 /* r = frsqrte(s) */
  148. fmuls fr3,fr1,fr0 /* r * s */
  149. fmuls fr2,fr1,fr5 /* r * 0.5 */
  150. fnmsubs fr3,fr1,fr3,fr4 /* 1 - s * r * r */
  151. fmadds fr1,fr2,fr3,fr1 /* r = r + 0.5 * r * (1 - s * r * r) */
  152. fmuls fr3,fr1,fr0 /* r * s */
  153. fmuls fr2,fr1,fr5 /* r * 0.5 */
  154. fnmsubs fr3,fr1,fr3,fr4 /* 1 - s * r * r */
  155. fmadds fr1,fr2,fr3,fr1 /* r = r + 0.5 * r * (1 - s * r * r) */
  156. stfsx fr1,r3,r6
  157. addi r6,r6,4
  158. bdnz 1b
  159. lfd fr5,-56(r1)
  160. lfd fr4,-48(r1)
  161. lfd fr3,-40(r1)
  162. lfd fr2,-32(r1)
  163. b fpdisable