/* bte_error.c */
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
 */
#include <linux/types.h>
#include <asm/sn/sn_sal.h>
#include "ioerror.h"
#include <asm/sn/addrs.h>
#include <asm/sn/shubio.h>
#include <asm/sn/geo.h>
#include "xtalk/xwidgetdev.h"
#include "xtalk/hubdev.h"
#include <asm/sn/bte.h>
#include <asm/param.h>

/*
 * Bte error handling is done in two parts. The first captures
 * any crb related errors. Since there can be multiple crbs per
 * interface and multiple interfaces active, we need to wait until
 * all active crbs are completed. This is the first job of the
 * second part error handler. When all bte related CRBs are cleanly
 * completed, it resets the interfaces and gets them ready for new
 * transfers to be queued.
 */

void bte_error_handler(unsigned long);
  28. /*
  29. * Wait until all BTE related CRBs are completed
  30. * and then reset the interfaces.
  31. */
  32. void shub1_bte_error_handler(unsigned long _nodepda)
  33. {
  34. struct nodepda_s *err_nodepda = (struct nodepda_s *)_nodepda;
  35. struct timer_list *recovery_timer = &err_nodepda->bte_recovery_timer;
  36. nasid_t nasid;
  37. int i;
  38. int valid_crbs;
  39. ii_imem_u_t imem; /* II IMEM Register */
  40. ii_icrb0_d_u_t icrbd; /* II CRB Register D */
  41. ii_ibcr_u_t ibcr;
  42. ii_icmr_u_t icmr;
  43. ii_ieclr_u_t ieclr;
  44. BTE_PRINTK(("shub1_bte_error_handler(%p) - %d\n", err_nodepda,
  45. smp_processor_id()));
  46. if ((err_nodepda->bte_if[0].bh_error == BTE_SUCCESS) &&
  47. (err_nodepda->bte_if[1].bh_error == BTE_SUCCESS)) {
  48. BTE_PRINTK(("eh:%p:%d Nothing to do.\n", err_nodepda,
  49. smp_processor_id()));
  50. return;
  51. }
  52. /* Determine information about our hub */
  53. nasid = cnodeid_to_nasid(err_nodepda->bte_if[0].bte_cnode);
  54. /*
  55. * A BTE transfer can use multiple CRBs. We need to make sure
  56. * that all the BTE CRBs are complete (or timed out) before
  57. * attempting to clean up the error. Resetting the BTE while
  58. * there are still BTE CRBs active will hang the BTE.
  59. * We should look at all the CRBs to see if they are allocated
  60. * to the BTE and see if they are still active. When none
  61. * are active, we can continue with the cleanup.
  62. *
  63. * We also want to make sure that the local NI port is up.
  64. * When a router resets the NI port can go down, while it
  65. * goes through the LLP handshake, but then comes back up.
  66. */
  67. icmr.ii_icmr_regval = REMOTE_HUB_L(nasid, IIO_ICMR);
  68. if (icmr.ii_icmr_fld_s.i_crb_mark != 0) {
  69. /*
  70. * There are errors which still need to be cleaned up by
  71. * hubiio_crb_error_handler
  72. */
  73. mod_timer(recovery_timer, HZ * 5);
  74. BTE_PRINTK(("eh:%p:%d Marked Giving up\n", err_nodepda,
  75. smp_processor_id()));
  76. return;
  77. }
  78. if (icmr.ii_icmr_fld_s.i_crb_vld != 0) {
  79. valid_crbs = icmr.ii_icmr_fld_s.i_crb_vld;
  80. for (i = 0; i < IIO_NUM_CRBS; i++) {
  81. if (!((1 << i) & valid_crbs)) {
  82. /* This crb was not marked as valid, ignore */
  83. continue;
  84. }
  85. icrbd.ii_icrb0_d_regval =
  86. REMOTE_HUB_L(nasid, IIO_ICRB_D(i));
  87. if (icrbd.d_bteop) {
  88. mod_timer(recovery_timer, HZ * 5);
  89. BTE_PRINTK(("eh:%p:%d Valid %d, Giving up\n",
  90. err_nodepda, smp_processor_id(),
  91. i));
  92. return;
  93. }
  94. }
  95. }
  96. BTE_PRINTK(("eh:%p:%d Cleaning up\n", err_nodepda, smp_processor_id()));
  97. /* Reenable both bte interfaces */
  98. imem.ii_imem_regval = REMOTE_HUB_L(nasid, IIO_IMEM);
  99. imem.ii_imem_fld_s.i_b0_esd = imem.ii_imem_fld_s.i_b1_esd = 1;
  100. REMOTE_HUB_S(nasid, IIO_IMEM, imem.ii_imem_regval);
  101. /* Clear BTE0/1 error bits */
  102. ieclr.ii_ieclr_regval = 0;
  103. if (err_nodepda->bte_if[0].bh_error != BTE_SUCCESS)
  104. ieclr.ii_ieclr_fld_s.i_e_bte_0 = 1;
  105. if (err_nodepda->bte_if[1].bh_error != BTE_SUCCESS)
  106. ieclr.ii_ieclr_fld_s.i_e_bte_1 = 1;
  107. REMOTE_HUB_S(nasid, IIO_IECLR, ieclr.ii_ieclr_regval);
  108. /* Reinitialize both BTE state machines. */
  109. ibcr.ii_ibcr_regval = REMOTE_HUB_L(nasid, IIO_IBCR);
  110. ibcr.ii_ibcr_fld_s.i_soft_reset = 1;
  111. REMOTE_HUB_S(nasid, IIO_IBCR, ibcr.ii_ibcr_regval);
  112. del_timer(recovery_timer);
  113. }
  114. /*
  115. * Wait until all BTE related CRBs are completed
  116. * and then reset the interfaces.
  117. */
  118. void bte_error_handler(unsigned long _nodepda)
  119. {
  120. struct nodepda_s *err_nodepda = (struct nodepda_s *)_nodepda;
  121. spinlock_t *recovery_lock = &err_nodepda->bte_recovery_lock;
  122. int i;
  123. nasid_t nasid;
  124. unsigned long irq_flags;
  125. volatile u64 *notify;
  126. bte_result_t bh_error;
  127. BTE_PRINTK(("bte_error_handler(%p) - %d\n", err_nodepda,
  128. smp_processor_id()));
  129. spin_lock_irqsave(recovery_lock, irq_flags);
  130. /*
  131. * Lock all interfaces on this node to prevent new transfers
  132. * from being queued.
  133. */
  134. for (i = 0; i < BTES_PER_NODE; i++) {
  135. if (err_nodepda->bte_if[i].cleanup_active) {
  136. continue;
  137. }
  138. spin_lock(&err_nodepda->bte_if[i].spinlock);
  139. BTE_PRINTK(("eh:%p:%d locked %d\n", err_nodepda,
  140. smp_processor_id(), i));
  141. err_nodepda->bte_if[i].cleanup_active = 1;
  142. }
  143. if (is_shub1()) {
  144. shub1_bte_error_handler(_nodepda);
  145. } else {
  146. nasid = cnodeid_to_nasid(err_nodepda->bte_if[0].bte_cnode);
  147. if (ia64_sn_bte_recovery(nasid))
  148. panic("bte_error_handler(): Fatal BTE Error");
  149. }
  150. for (i = 0; i < BTES_PER_NODE; i++) {
  151. bh_error = err_nodepda->bte_if[i].bh_error;
  152. if (bh_error != BTE_SUCCESS) {
  153. /* There is an error which needs to be notified */
  154. notify = err_nodepda->bte_if[i].most_rcnt_na;
  155. BTE_PRINTK(("cnode %d bte %d error=0x%lx\n",
  156. err_nodepda->bte_if[i].bte_cnode,
  157. err_nodepda->bte_if[i].bte_num,
  158. IBLS_ERROR | (u64) bh_error));
  159. *notify = IBLS_ERROR | bh_error;
  160. err_nodepda->bte_if[i].bh_error = BTE_SUCCESS;
  161. }
  162. err_nodepda->bte_if[i].cleanup_active = 0;
  163. BTE_PRINTK(("eh:%p:%d Unlocked %d\n", err_nodepda,
  164. smp_processor_id(), i));
  165. spin_unlock(&err_nodepda->bte_if[i].spinlock);
  166. }
  167. spin_unlock_irqrestore(recovery_lock, irq_flags);
  168. }
  169. /*
  170. * First part error handler. This is called whenever any error CRB interrupt
  171. * is generated by the II.
  172. */
  173. void
  174. bte_crb_error_handler(cnodeid_t cnode, int btenum,
  175. int crbnum, ioerror_t * ioe, int bteop)
  176. {
  177. struct bteinfo_s *bte;
  178. bte = &(NODEPDA(cnode)->bte_if[btenum]);
  179. /*
  180. * The caller has already figured out the error type, we save that
  181. * in the bte handle structure for the thread excercising the
  182. * interface to consume.
  183. */
  184. bte->bh_error = ioe->ie_errortype + BTEFAIL_OFFSET;
  185. bte->bte_error_count++;
  186. BTE_PRINTK(("Got an error on cnode %d bte %d: HW error type 0x%x\n",
  187. bte->bte_cnode, bte->bte_num, ioe->ie_errortype));
  188. bte_error_handler((unsigned long) NODEPDA(cnode));
  189. }