  1. /*
  2. * This file is subject to the terms and conditions of the GNU General Public
  3. * License. See the file "COPYING" in the main directory of this archive
  4. * for more details.
  5. *
  6. * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
  7. */
#include <linux/types.h>
#include <linux/jiffies.h>
#include <asm/sn/sn_sal.h>
#include "ioerror.h"
#include <asm/sn/addrs.h>
#include <asm/sn/shubio.h>
#include <asm/sn/geo.h>
#include "xtalk/xwidgetdev.h"
#include "xtalk/hubdev.h"
#include <asm/sn/bte.h>
#include <asm/param.h>
  18. /*
  19. * Bte error handling is done in two parts. The first captures
  20. * any crb related errors. Since there can be multiple crbs per
  21. * interface and multiple interfaces active, we need to wait until
  22. * all active crbs are completed. This is the first job of the
  23. * second part error handler. When all bte related CRBs are cleanly
  24. * completed, it resets the interfaces and gets them ready for new
  25. * transfers to be queued.
  26. */
  27. void bte_error_handler(unsigned long);
  28. /*
  29. * Wait until all BTE related CRBs are completed
  30. * and then reset the interfaces.
  31. */
  32. int shub1_bte_error_handler(unsigned long _nodepda)
  33. {
  34. struct nodepda_s *err_nodepda = (struct nodepda_s *)_nodepda;
  35. struct timer_list *recovery_timer = &err_nodepda->bte_recovery_timer;
  36. nasid_t nasid;
  37. int i;
  38. int valid_crbs;
  39. ii_imem_u_t imem; /* II IMEM Register */
  40. ii_icrb0_d_u_t icrbd; /* II CRB Register D */
  41. ii_ibcr_u_t ibcr;
  42. ii_icmr_u_t icmr;
  43. ii_ieclr_u_t ieclr;
  44. BTE_PRINTK(("shub1_bte_error_handler(%p) - %d\n", err_nodepda,
  45. smp_processor_id()));
  46. if ((err_nodepda->bte_if[0].bh_error == BTE_SUCCESS) &&
  47. (err_nodepda->bte_if[1].bh_error == BTE_SUCCESS)) {
  48. BTE_PRINTK(("eh:%p:%d Nothing to do.\n", err_nodepda,
  49. smp_processor_id()));
  50. return 1;
  51. }
  52. /* Determine information about our hub */
  53. nasid = cnodeid_to_nasid(err_nodepda->bte_if[0].bte_cnode);
  54. /*
  55. * A BTE transfer can use multiple CRBs. We need to make sure
  56. * that all the BTE CRBs are complete (or timed out) before
  57. * attempting to clean up the error. Resetting the BTE while
  58. * there are still BTE CRBs active will hang the BTE.
  59. * We should look at all the CRBs to see if they are allocated
  60. * to the BTE and see if they are still active. When none
  61. * are active, we can continue with the cleanup.
  62. *
  63. * We also want to make sure that the local NI port is up.
  64. * When a router resets the NI port can go down, while it
  65. * goes through the LLP handshake, but then comes back up.
  66. */
  67. icmr.ii_icmr_regval = REMOTE_HUB_L(nasid, IIO_ICMR);
  68. if (icmr.ii_icmr_fld_s.i_crb_mark != 0) {
  69. /*
  70. * There are errors which still need to be cleaned up by
  71. * hubiio_crb_error_handler
  72. */
  73. mod_timer(recovery_timer, HZ * 5);
  74. BTE_PRINTK(("eh:%p:%d Marked Giving up\n", err_nodepda,
  75. smp_processor_id()));
  76. return 1;
  77. }
  78. if (icmr.ii_icmr_fld_s.i_crb_vld != 0) {
  79. valid_crbs = icmr.ii_icmr_fld_s.i_crb_vld;
  80. for (i = 0; i < IIO_NUM_CRBS; i++) {
  81. if (!((1 << i) & valid_crbs)) {
  82. /* This crb was not marked as valid, ignore */
  83. continue;
  84. }
  85. icrbd.ii_icrb0_d_regval =
  86. REMOTE_HUB_L(nasid, IIO_ICRB_D(i));
  87. if (icrbd.d_bteop) {
  88. mod_timer(recovery_timer, HZ * 5);
  89. BTE_PRINTK(("eh:%p:%d Valid %d, Giving up\n",
  90. err_nodepda, smp_processor_id(),
  91. i));
  92. return 1;
  93. }
  94. }
  95. }
  96. BTE_PRINTK(("eh:%p:%d Cleaning up\n", err_nodepda, smp_processor_id()));
  97. /* Reenable both bte interfaces */
  98. imem.ii_imem_regval = REMOTE_HUB_L(nasid, IIO_IMEM);
  99. imem.ii_imem_fld_s.i_b0_esd = imem.ii_imem_fld_s.i_b1_esd = 1;
  100. REMOTE_HUB_S(nasid, IIO_IMEM, imem.ii_imem_regval);
  101. /* Clear BTE0/1 error bits */
  102. ieclr.ii_ieclr_regval = 0;
  103. if (err_nodepda->bte_if[0].bh_error != BTE_SUCCESS)
  104. ieclr.ii_ieclr_fld_s.i_e_bte_0 = 1;
  105. if (err_nodepda->bte_if[1].bh_error != BTE_SUCCESS)
  106. ieclr.ii_ieclr_fld_s.i_e_bte_1 = 1;
  107. REMOTE_HUB_S(nasid, IIO_IECLR, ieclr.ii_ieclr_regval);
  108. /* Reinitialize both BTE state machines. */
  109. ibcr.ii_ibcr_regval = REMOTE_HUB_L(nasid, IIO_IBCR);
  110. ibcr.ii_ibcr_fld_s.i_soft_reset = 1;
  111. REMOTE_HUB_S(nasid, IIO_IBCR, ibcr.ii_ibcr_regval);
  112. del_timer(recovery_timer);
  113. return 0;
  114. }
  115. /*
  116. * Wait until all BTE related CRBs are completed
  117. * and then reset the interfaces.
  118. */
  119. int shub2_bte_error_handler(unsigned long _nodepda)
  120. {
  121. struct nodepda_s *err_nodepda = (struct nodepda_s *)_nodepda;
  122. struct timer_list *recovery_timer = &err_nodepda->bte_recovery_timer;
  123. struct bteinfo_s *bte;
  124. nasid_t nasid;
  125. u64 status;
  126. int i;
  127. nasid = cnodeid_to_nasid(err_nodepda->bte_if[0].bte_cnode);
  128. /*
  129. * Verify that all the BTEs are complete
  130. */
  131. for (i = 0; i < BTES_PER_NODE; i++) {
  132. bte = &err_nodepda->bte_if[i];
  133. status = BTE_LNSTAT_LOAD(bte);
  134. if ((status & IBLS_ERROR) || !(status & IBLS_BUSY))
  135. continue;
  136. mod_timer(recovery_timer, HZ * 5);
  137. BTE_PRINTK(("eh:%p:%d Marked Giving up\n", err_nodepda,
  138. smp_processor_id()));
  139. return 1;
  140. }
  141. if (ia64_sn_bte_recovery(nasid))
  142. panic("bte_error_handler(): Fatal BTE Error");
  143. del_timer(recovery_timer);
  144. return 0;
  145. }
  146. /*
  147. * Wait until all BTE related CRBs are completed
  148. * and then reset the interfaces.
  149. */
  150. void bte_error_handler(unsigned long _nodepda)
  151. {
  152. struct nodepda_s *err_nodepda = (struct nodepda_s *)_nodepda;
  153. spinlock_t *recovery_lock = &err_nodepda->bte_recovery_lock;
  154. int i;
  155. unsigned long irq_flags;
  156. volatile u64 *notify;
  157. bte_result_t bh_error;
  158. BTE_PRINTK(("bte_error_handler(%p) - %d\n", err_nodepda,
  159. smp_processor_id()));
  160. spin_lock_irqsave(recovery_lock, irq_flags);
  161. /*
  162. * Lock all interfaces on this node to prevent new transfers
  163. * from being queued.
  164. */
  165. for (i = 0; i < BTES_PER_NODE; i++) {
  166. if (err_nodepda->bte_if[i].cleanup_active) {
  167. continue;
  168. }
  169. spin_lock(&err_nodepda->bte_if[i].spinlock);
  170. BTE_PRINTK(("eh:%p:%d locked %d\n", err_nodepda,
  171. smp_processor_id(), i));
  172. err_nodepda->bte_if[i].cleanup_active = 1;
  173. }
  174. if (is_shub1()) {
  175. if (shub1_bte_error_handler(_nodepda)) {
  176. spin_unlock_irqrestore(recovery_lock, irq_flags);
  177. return;
  178. }
  179. } else {
  180. if (shub2_bte_error_handler(_nodepda)) {
  181. spin_unlock_irqrestore(recovery_lock, irq_flags);
  182. return;
  183. }
  184. }
  185. for (i = 0; i < BTES_PER_NODE; i++) {
  186. bh_error = err_nodepda->bte_if[i].bh_error;
  187. if (bh_error != BTE_SUCCESS) {
  188. /* There is an error which needs to be notified */
  189. notify = err_nodepda->bte_if[i].most_rcnt_na;
  190. BTE_PRINTK(("cnode %d bte %d error=0x%lx\n",
  191. err_nodepda->bte_if[i].bte_cnode,
  192. err_nodepda->bte_if[i].bte_num,
  193. IBLS_ERROR | (u64) bh_error));
  194. *notify = IBLS_ERROR | bh_error;
  195. err_nodepda->bte_if[i].bh_error = BTE_SUCCESS;
  196. }
  197. err_nodepda->bte_if[i].cleanup_active = 0;
  198. BTE_PRINTK(("eh:%p:%d Unlocked %d\n", err_nodepda,
  199. smp_processor_id(), i));
  200. spin_unlock(&err_nodepda->bte_if[i].spinlock);
  201. }
  202. spin_unlock_irqrestore(recovery_lock, irq_flags);
  203. }
  204. /*
  205. * First part error handler. This is called whenever any error CRB interrupt
  206. * is generated by the II.
  207. */
  208. void
  209. bte_crb_error_handler(cnodeid_t cnode, int btenum,
  210. int crbnum, ioerror_t * ioe, int bteop)
  211. {
  212. struct bteinfo_s *bte;
  213. bte = &(NODEPDA(cnode)->bte_if[btenum]);
  214. /*
  215. * The caller has already figured out the error type, we save that
  216. * in the bte handle structure for the thread excercising the
  217. * interface to consume.
  218. */
  219. bte->bh_error = ioe->ie_errortype + BTEFAIL_OFFSET;
  220. bte->bte_error_count++;
  221. BTE_PRINTK(("Got an error on cnode %d bte %d: HW error type 0x%x\n",
  222. bte->bte_cnode, bte->bte_num, ioe->ie_errortype));
  223. bte_error_handler((unsigned long) NODEPDA(cnode));
  224. }