eeh.c 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100
  1. /*
  2. * eeh.c
  3. * Copyright (C) 2001 Dave Engebretsen & Todd Inglett IBM Corporation
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. */
  19. #include <linux/delay.h>
  20. #include <linux/init.h>
  21. #include <linux/list.h>
  22. #include <linux/pci.h>
  23. #include <linux/proc_fs.h>
  24. #include <linux/rbtree.h>
  25. #include <linux/seq_file.h>
  26. #include <linux/spinlock.h>
  27. #include <asm/atomic.h>
  28. #include <asm/eeh.h>
  29. #include <asm/eeh_event.h>
  30. #include <asm/io.h>
  31. #include <asm/machdep.h>
  32. #include <asm/ppc-pci.h>
  33. #include <asm/rtas.h>
  34. #undef DEBUG
  35. /** Overview:
  36. * EEH, or "Extended Error Handling" is a PCI bridge technology for
  37. * dealing with PCI bus errors that can't be dealt with within the
  38. * usual PCI framework, except by check-stopping the CPU. Systems
  39. * that are designed for high-availability/reliability cannot afford
  40. * to crash due to a "mere" PCI error, thus the need for EEH.
  41. * An EEH-capable bridge operates by converting a detected error
  42. * into a "slot freeze", taking the PCI adapter off-line, making
  43. * the slot behave, from the OS'es point of view, as if the slot
  44. * were "empty": all reads return 0xff's and all writes are silently
  45. * ignored. EEH slot isolation events can be triggered by parity
  46. * errors on the address or data busses (e.g. during posted writes),
  47. * which in turn might be caused by low voltage on the bus, dust,
  48. * vibration, humidity, radioactivity or plain-old failed hardware.
  49. *
  50. * Note, however, that one of the leading causes of EEH slot
  51. * freeze events are buggy device drivers, buggy device microcode,
  52. * or buggy device hardware. This is because any attempt by the
  53. * device to bus-master data to a memory address that is not
  54. * assigned to the device will trigger a slot freeze. (The idea
  55. * is to prevent devices-gone-wild from corrupting system memory).
  56. * Buggy hardware/drivers will have a miserable time co-existing
  57. * with EEH.
  58. *
  59. * Ideally, a PCI device driver, when suspecting that an isolation
  60. * event has occurred (e.g. by reading 0xff's), will then ask EEH
  61. * whether this is the case, and then take appropriate steps to
  62. * reset the PCI slot, the PCI device, and then resume operations.
  63. * However, until that day, the checking is done here, with the
  64. * eeh_check_failure() routine embedded in the MMIO macros. If
  65. * the slot is found to be isolated, an "EEH Event" is synthesized
  66. * and sent out for processing.
  67. */
  68. /* If a device driver keeps reading an MMIO register in an interrupt
  69. * handler after a slot isolation event has occurred, we assume it
  70. * is broken and panic. This sets the threshold for how many read
  71. * attempts we allow before panicking.
  72. */
  73. #define EEH_MAX_FAILS 100000
  74. /* RTAS tokens */
  75. static int ibm_set_eeh_option;
  76. static int ibm_set_slot_reset;
  77. static int ibm_read_slot_reset_state;
  78. static int ibm_read_slot_reset_state2;
  79. static int ibm_slot_error_detail;
  80. static int ibm_get_config_addr_info;
  81. static int ibm_configure_bridge;
  82. int eeh_subsystem_enabled;
  83. EXPORT_SYMBOL(eeh_subsystem_enabled);
  84. /* Lock to avoid races due to multiple reports of an error */
  85. static DEFINE_SPINLOCK(confirm_error_lock);
  86. /* Buffer for reporting slot-error-detail rtas calls */
  87. static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
  88. static DEFINE_SPINLOCK(slot_errbuf_lock);
  89. static int eeh_error_buf_size;
  90. /* System monitoring statistics */
  91. static unsigned long no_device;
  92. static unsigned long no_dn;
  93. static unsigned long no_cfg_addr;
  94. static unsigned long ignored_check;
  95. static unsigned long total_mmio_ffs;
  96. static unsigned long false_positives;
  97. static unsigned long ignored_failures;
  98. static unsigned long slot_resets;
  /*
   * True when @class_code names a PCI bridge.  The class code carries the
   * base class in bits 23:16 (the same layout the display-class check in
   * early_enable_eeh() assumes), so it has to be shifted *down* before it
   * is compared with the 8-bit PCI_BASE_CLASS_BRIDGE constant.  The former
   * left shift could never match a bridge.
   */
  #define IS_BRIDGE(class_code) (((class_code) >> 16) == PCI_BASE_CLASS_BRIDGE)
  100. /* --------------------------------------------------------------- */
  101. /* Below lies the EEH event infrastructure */
  102. void eeh_slot_error_detail (struct pci_dn *pdn, int severity)
  103. {
  104. int config_addr;
  105. unsigned long flags;
  106. int rc;
  107. /* Log the error with the rtas logger */
  108. spin_lock_irqsave(&slot_errbuf_lock, flags);
  109. memset(slot_errbuf, 0, eeh_error_buf_size);
  110. /* Use PE configuration address, if present */
  111. config_addr = pdn->eeh_config_addr;
  112. if (pdn->eeh_pe_config_addr)
  113. config_addr = pdn->eeh_pe_config_addr;
  114. rc = rtas_call(ibm_slot_error_detail,
  115. 8, 1, NULL, config_addr,
  116. BUID_HI(pdn->phb->buid),
  117. BUID_LO(pdn->phb->buid), NULL, 0,
  118. virt_to_phys(slot_errbuf),
  119. eeh_error_buf_size,
  120. severity);
  121. if (rc == 0)
  122. log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
  123. spin_unlock_irqrestore(&slot_errbuf_lock, flags);
  124. }
  125. /**
  126. * read_slot_reset_state - Read the reset state of a device node's slot
  127. * @dn: device node to read
  128. * @rets: array to return results in
  129. */
  130. static int read_slot_reset_state(struct pci_dn *pdn, int rets[])
  131. {
  132. int token, outputs;
  133. int config_addr;
  134. if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) {
  135. token = ibm_read_slot_reset_state2;
  136. outputs = 4;
  137. } else {
  138. token = ibm_read_slot_reset_state;
  139. rets[2] = 0; /* fake PE Unavailable info */
  140. outputs = 3;
  141. }
  142. /* Use PE configuration address, if present */
  143. config_addr = pdn->eeh_config_addr;
  144. if (pdn->eeh_pe_config_addr)
  145. config_addr = pdn->eeh_pe_config_addr;
  146. return rtas_call(token, 3, outputs, rets, config_addr,
  147. BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid));
  148. }
  149. /**
  150. * eeh_token_to_phys - convert EEH address token to phys address
  151. * @token i/o token, should be address in the form 0xA....
  152. */
  153. static inline unsigned long eeh_token_to_phys(unsigned long token)
  154. {
  155. pte_t *ptep;
  156. unsigned long pa;
  157. ptep = find_linux_pte(init_mm.pgd, token);
  158. if (!ptep)
  159. return token;
  160. pa = pte_pfn(*ptep) << PAGE_SHIFT;
  161. return pa | (token & (PAGE_SIZE-1));
  162. }
  163. /**
  164. * Return the "partitionable endpoint" (pe) under which this device lies
  165. */
  166. struct device_node * find_device_pe(struct device_node *dn)
  167. {
  168. while ((dn->parent) && PCI_DN(dn->parent) &&
  169. (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
  170. dn = dn->parent;
  171. }
  172. return dn;
  173. }
  174. /** Mark all devices that are peers of this device as failed.
  175. * Mark the device driver too, so that it can see the failure
  176. * immediately; this is critical, since some drivers poll
  177. * status registers in interrupts ... If a driver is polling,
  178. * and the slot is frozen, then the driver can deadlock in
  179. * an interrupt context, which is bad.
  180. */
  181. static void __eeh_mark_slot (struct device_node *dn, int mode_flag)
  182. {
  183. while (dn) {
  184. if (PCI_DN(dn)) {
  185. /* Mark the pci device driver too */
  186. struct pci_dev *dev = PCI_DN(dn)->pcidev;
  187. PCI_DN(dn)->eeh_mode |= mode_flag;
  188. if (dev && dev->driver)
  189. dev->error_state = pci_channel_io_frozen;
  190. if (dn->child)
  191. __eeh_mark_slot (dn->child, mode_flag);
  192. }
  193. dn = dn->sibling;
  194. }
  195. }
  196. void eeh_mark_slot (struct device_node *dn, int mode_flag)
  197. {
  198. struct pci_dev *dev;
  199. dn = find_device_pe (dn);
  200. /* Back up one, since config addrs might be shared */
  201. if (PCI_DN(dn) && PCI_DN(dn)->eeh_pe_config_addr)
  202. dn = dn->parent;
  203. PCI_DN(dn)->eeh_mode |= mode_flag;
  204. /* Mark the pci device too */
  205. dev = PCI_DN(dn)->pcidev;
  206. if (dev)
  207. dev->error_state = pci_channel_io_frozen;
  208. __eeh_mark_slot (dn->child, mode_flag);
  209. }
  210. static void __eeh_clear_slot (struct device_node *dn, int mode_flag)
  211. {
  212. while (dn) {
  213. if (PCI_DN(dn)) {
  214. PCI_DN(dn)->eeh_mode &= ~mode_flag;
  215. PCI_DN(dn)->eeh_check_count = 0;
  216. if (dn->child)
  217. __eeh_clear_slot (dn->child, mode_flag);
  218. }
  219. dn = dn->sibling;
  220. }
  221. }
  222. void eeh_clear_slot (struct device_node *dn, int mode_flag)
  223. {
  224. unsigned long flags;
  225. spin_lock_irqsave(&confirm_error_lock, flags);
  226. dn = find_device_pe (dn);
  227. /* Back up one, since config addrs might be shared */
  228. if (PCI_DN(dn) && PCI_DN(dn)->eeh_pe_config_addr)
  229. dn = dn->parent;
  230. PCI_DN(dn)->eeh_mode &= ~mode_flag;
  231. PCI_DN(dn)->eeh_check_count = 0;
  232. __eeh_clear_slot (dn->child, mode_flag);
  233. spin_unlock_irqrestore(&confirm_error_lock, flags);
  234. }
  235. /**
  236. * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
  237. * @dn device node
  238. * @dev pci device, if known
  239. *
  240. * Check for an EEH failure for the given device node. Call this
  241. * routine if the result of a read was all 0xff's and you want to
  242. * find out if this is due to an EEH slot freeze. This routine
  243. * will query firmware for the EEH status.
  244. *
  245. * Returns 0 if there has not been an EEH error; otherwise returns
  246. * a non-zero value and queues up a slot isolation event notification.
  247. *
  248. * It is safe to call this routine in an interrupt context.
  249. */
  250. int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
  251. {
  252. int ret;
  253. int rets[3];
  254. unsigned long flags;
  255. struct pci_dn *pdn;
  256. enum pci_channel_state state;
  257. int rc = 0;
  258. total_mmio_ffs++;
  259. if (!eeh_subsystem_enabled)
  260. return 0;
  261. if (!dn) {
  262. no_dn++;
  263. return 0;
  264. }
  265. pdn = PCI_DN(dn);
  266. /* Access to IO BARs might get this far and still not want checking. */
  267. if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) ||
  268. pdn->eeh_mode & EEH_MODE_NOCHECK) {
  269. ignored_check++;
  270. #ifdef DEBUG
  271. printk ("EEH:ignored check (%x) for %s %s\n",
  272. pdn->eeh_mode, pci_name (dev), dn->full_name);
  273. #endif
  274. return 0;
  275. }
  276. if (!pdn->eeh_config_addr && !pdn->eeh_pe_config_addr) {
  277. no_cfg_addr++;
  278. return 0;
  279. }
  280. /* If we already have a pending isolation event for this
  281. * slot, we know it's bad already, we don't need to check.
  282. * Do this checking under a lock; as multiple PCI devices
  283. * in one slot might report errors simultaneously, and we
  284. * only want one error recovery routine running.
  285. */
  286. spin_lock_irqsave(&confirm_error_lock, flags);
  287. rc = 1;
  288. if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
  289. pdn->eeh_check_count ++;
  290. if (pdn->eeh_check_count >= EEH_MAX_FAILS) {
  291. printk (KERN_ERR "EEH: Device driver ignored %d bad reads, panicing\n",
  292. pdn->eeh_check_count);
  293. dump_stack();
  294. msleep(5000);
  295. /* re-read the slot reset state */
  296. if (read_slot_reset_state(pdn, rets) != 0)
  297. rets[0] = -1; /* reset state unknown */
  298. /* If we are here, then we hit an infinite loop. Stop. */
  299. panic("EEH: MMIO halt (%d) on device:%s\n", rets[0], pci_name(dev));
  300. }
  301. goto dn_unlock;
  302. }
  303. /*
  304. * Now test for an EEH failure. This is VERY expensive.
  305. * Note that the eeh_config_addr may be a parent device
  306. * in the case of a device behind a bridge, or it may be
  307. * function zero of a multi-function device.
  308. * In any case they must share a common PHB.
  309. */
  310. ret = read_slot_reset_state(pdn, rets);
  311. /* If the call to firmware failed, punt */
  312. if (ret != 0) {
  313. printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",
  314. ret, dn->full_name);
  315. false_positives++;
  316. rc = 0;
  317. goto dn_unlock;
  318. }
  319. /* If EEH is not supported on this device, punt. */
  320. if (rets[1] != 1) {
  321. printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",
  322. ret, dn->full_name);
  323. false_positives++;
  324. rc = 0;
  325. goto dn_unlock;
  326. }
  327. /* If not the kind of error we know about, punt. */
  328. if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {
  329. false_positives++;
  330. rc = 0;
  331. goto dn_unlock;
  332. }
  333. /* Note that config-io to empty slots may fail;
  334. * we recognize empty because they don't have children. */
  335. if ((rets[0] == 5) && (dn->child == NULL)) {
  336. false_positives++;
  337. rc = 0;
  338. goto dn_unlock;
  339. }
  340. slot_resets++;
  341. /* Avoid repeated reports of this failure, including problems
  342. * with other functions on this device, and functions under
  343. * bridges. */
  344. eeh_mark_slot (dn, EEH_MODE_ISOLATED);
  345. spin_unlock_irqrestore(&confirm_error_lock, flags);
  346. state = pci_channel_io_normal;
  347. if ((rets[0] == 2) || (rets[0] == 4))
  348. state = pci_channel_io_frozen;
  349. if (rets[0] == 5)
  350. state = pci_channel_io_perm_failure;
  351. eeh_send_failure_event (dn, dev, state, rets[2]);
  352. /* Most EEH events are due to device driver bugs. Having
  353. * a stack trace will help the device-driver authors figure
  354. * out what happened. So print that out. */
  355. if (rets[0] != 5) dump_stack();
  356. return 1;
  357. dn_unlock:
  358. spin_unlock_irqrestore(&confirm_error_lock, flags);
  359. return rc;
  360. }
  361. EXPORT_SYMBOL_GPL(eeh_dn_check_failure);
  362. /**
  363. * eeh_check_failure - check if all 1's data is due to EEH slot freeze
  364. * @token i/o token, should be address in the form 0xA....
  365. * @val value, should be all 1's (XXX why do we need this arg??)
  366. *
  367. * Check for an EEH failure at the given token address. Call this
  368. * routine if the result of a read was all 0xff's and you want to
  369. * find out if this is due to an EEH slot freeze event. This routine
  370. * will query firmware for the EEH status.
  371. *
  372. * Note this routine is safe to call in an interrupt context.
  373. */
  374. unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val)
  375. {
  376. unsigned long addr;
  377. struct pci_dev *dev;
  378. struct device_node *dn;
  379. /* Finding the phys addr + pci device; this is pretty quick. */
  380. addr = eeh_token_to_phys((unsigned long __force) token);
  381. dev = pci_get_device_by_addr(addr);
  382. if (!dev) {
  383. no_device++;
  384. return val;
  385. }
  386. dn = pci_device_to_OF_node(dev);
  387. eeh_dn_check_failure (dn, dev);
  388. pci_dev_put(dev);
  389. return val;
  390. }
  391. EXPORT_SYMBOL(eeh_check_failure);
  392. /* ------------------------------------------------------------- */
  393. /* The code below deals with error recovery */
  394. /**
  395. * eeh_slot_availability - returns error status of slot
  396. * @pdn pci device node
  397. *
  398. * Return negative value if a permanent error, else return
  399. * a number of milliseconds to wait until the PCI slot is
  400. * ready to be used.
  401. */
  402. static int
  403. eeh_slot_availability(struct pci_dn *pdn)
  404. {
  405. int rc;
  406. int rets[3];
  407. rc = read_slot_reset_state(pdn, rets);
  408. if (rc) return rc;
  409. if (rets[1] == 0) return -1; /* EEH is not supported */
  410. if (rets[0] == 0) return 0; /* Oll Korrect */
  411. if (rets[0] == 5) {
  412. if (rets[2] == 0) return -1; /* permanently unavailable */
  413. return rets[2]; /* number of millisecs to wait */
  414. }
  415. if (rets[0] == 1)
  416. return 250;
  417. printk (KERN_ERR "EEH: Slot unavailable: rc=%d, rets=%d %d %d\n",
  418. rc, rets[0], rets[1], rets[2]);
  419. return -2;
  420. }
  421. /**
  422. * rtas_pci_enable - enable MMIO or DMA transfers for this slot
  423. * @pdn pci device node
  424. */
  425. int
  426. rtas_pci_enable(struct pci_dn *pdn, int function)
  427. {
  428. int config_addr;
  429. int rc;
  430. /* Use PE configuration address, if present */
  431. config_addr = pdn->eeh_config_addr;
  432. if (pdn->eeh_pe_config_addr)
  433. config_addr = pdn->eeh_pe_config_addr;
  434. rc = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
  435. config_addr,
  436. BUID_HI(pdn->phb->buid),
  437. BUID_LO(pdn->phb->buid),
  438. function);
  439. if (rc)
  440. printk(KERN_WARNING "EEH: Cannot enable function %d, err=%d dn=%s\n",
  441. function, rc, pdn->node->full_name);
  442. return rc;
  443. }
  444. /**
  445. * rtas_pci_slot_reset - raises/lowers the pci #RST line
  446. * @pdn pci device node
  447. * @state: 1/0 to raise/lower the #RST
  448. *
  449. * Clear the EEH-frozen condition on a slot. This routine
  450. * asserts the PCI #RST line if the 'state' argument is '1',
  451. * and drops the #RST line if 'state is '0'. This routine is
  452. * safe to call in an interrupt context.
  453. *
  454. */
  455. static void
  456. rtas_pci_slot_reset(struct pci_dn *pdn, int state)
  457. {
  458. int config_addr;
  459. int rc;
  460. BUG_ON (pdn==NULL);
  461. if (!pdn->phb) {
  462. printk (KERN_WARNING "EEH: in slot reset, device node %s has no phb\n",
  463. pdn->node->full_name);
  464. return;
  465. }
  466. /* Use PE configuration address, if present */
  467. config_addr = pdn->eeh_config_addr;
  468. if (pdn->eeh_pe_config_addr)
  469. config_addr = pdn->eeh_pe_config_addr;
  470. rc = rtas_call(ibm_set_slot_reset,4,1, NULL,
  471. config_addr,
  472. BUID_HI(pdn->phb->buid),
  473. BUID_LO(pdn->phb->buid),
  474. state);
  475. if (rc)
  476. printk (KERN_WARNING "EEH: Unable to reset the failed slot,"
  477. " (%d) #RST=%d dn=%s\n",
  478. rc, state, pdn->node->full_name);
  479. }
  480. /**
  481. * rtas_set_slot_reset -- assert the pci #RST line for 1/4 second
  482. * @pdn: pci device node to be reset.
  483. *
  484. * Return 0 if success, else a non-zero value.
  485. */
  486. static void __rtas_set_slot_reset(struct pci_dn *pdn)
  487. {
  488. rtas_pci_slot_reset (pdn, 1);
  489. /* The PCI bus requires that the reset be held high for at least
  490. * a 100 milliseconds. We wait a bit longer 'just in case'. */
  491. #define PCI_BUS_RST_HOLD_TIME_MSEC 250
  492. msleep (PCI_BUS_RST_HOLD_TIME_MSEC);
  493. /* We might get hit with another EEH freeze as soon as the
  494. * pci slot reset line is dropped. Make sure we don't miss
  495. * these, and clear the flag now. */
  496. eeh_clear_slot (pdn->node, EEH_MODE_ISOLATED);
  497. rtas_pci_slot_reset (pdn, 0);
  498. /* After a PCI slot has been reset, the PCI Express spec requires
  499. * a 1.5 second idle time for the bus to stabilize, before starting
  500. * up traffic. */
  501. #define PCI_BUS_SETTLE_TIME_MSEC 1800
  502. msleep (PCI_BUS_SETTLE_TIME_MSEC);
  503. }
  504. int rtas_set_slot_reset(struct pci_dn *pdn)
  505. {
  506. int i, rc;
  507. __rtas_set_slot_reset(pdn);
  508. /* Now double check with the firmware to make sure the device is
  509. * ready to be used; if not, wait for recovery. */
  510. for (i=0; i<10; i++) {
  511. rc = eeh_slot_availability (pdn);
  512. if (rc == 0)
  513. return 0;
  514. if (rc == -2) {
  515. printk (KERN_ERR "EEH: failed (%d) to reset slot %s\n",
  516. i, pdn->node->full_name);
  517. __rtas_set_slot_reset(pdn);
  518. continue;
  519. }
  520. if (rc < 0) {
  521. printk (KERN_ERR "EEH: unrecoverable slot failure %s\n",
  522. pdn->node->full_name);
  523. return -1;
  524. }
  525. msleep (rc+100);
  526. }
  527. rc = eeh_slot_availability (pdn);
  528. if (rc)
  529. printk (KERN_ERR "EEH: timeout resetting slot %s\n", pdn->node->full_name);
  530. return rc;
  531. }
  532. /* ------------------------------------------------------- */
  533. /** Save and restore of PCI BARs
  534. *
  535. * Although firmware will set up BARs during boot, it doesn't
  536. * set up device BAR's after a device reset, although it will,
  537. * if requested, set up bridge configuration. Thus, we need to
  538. * configure the PCI devices ourselves.
  539. */
  540. /**
  541. * __restore_bars - Restore the Base Address Registers
  542. * @pdn: pci device node
  543. *
  544. * Loads the PCI configuration space base address registers,
  545. * the expansion ROM base address, the latency timer, and etc.
  546. * from the saved values in the device node.
  547. */
  548. static inline void __restore_bars (struct pci_dn *pdn)
  549. {
  550. int i;
  551. if (NULL==pdn->phb) return;
  552. for (i=4; i<10; i++) {
  553. rtas_write_config(pdn, i*4, 4, pdn->config_space[i]);
  554. }
  555. /* 12 == Expansion ROM Address */
  556. rtas_write_config(pdn, 12*4, 4, pdn->config_space[12]);
  557. #define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
  558. #define SAVED_BYTE(OFF) (((u8 *)(pdn->config_space))[BYTE_SWAP(OFF)])
  559. rtas_write_config (pdn, PCI_CACHE_LINE_SIZE, 1,
  560. SAVED_BYTE(PCI_CACHE_LINE_SIZE));
  561. rtas_write_config (pdn, PCI_LATENCY_TIMER, 1,
  562. SAVED_BYTE(PCI_LATENCY_TIMER));
  563. /* max latency, min grant, interrupt pin and line */
  564. rtas_write_config(pdn, 15*4, 4, pdn->config_space[15]);
  565. }
  566. /**
  567. * eeh_restore_bars - restore the PCI config space info
  568. *
  569. * This routine performs a recursive walk to the children
  570. * of this device as well.
  571. */
  572. void eeh_restore_bars(struct pci_dn *pdn)
  573. {
  574. struct device_node *dn;
  575. if (!pdn)
  576. return;
  577. if ((pdn->eeh_mode & EEH_MODE_SUPPORTED) && !IS_BRIDGE(pdn->class_code))
  578. __restore_bars (pdn);
  579. dn = pdn->node->child;
  580. while (dn) {
  581. eeh_restore_bars (PCI_DN(dn));
  582. dn = dn->sibling;
  583. }
  584. }
  585. /**
  586. * eeh_save_bars - save device bars
  587. *
  588. * Save the values of the device bars. Unlike the restore
  589. * routine, this routine is *not* recursive. This is because
  590. * PCI devices are added individuallly; but, for the restore,
  591. * an entire slot is reset at a time.
  592. */
  593. static void eeh_save_bars(struct pci_dn *pdn)
  594. {
  595. int i;
  596. if (!pdn )
  597. return;
  598. for (i = 0; i < 16; i++)
  599. rtas_read_config(pdn, i * 4, 4, &pdn->config_space[i]);
  600. }
  601. void
  602. rtas_configure_bridge(struct pci_dn *pdn)
  603. {
  604. int config_addr;
  605. int rc;
  606. /* Use PE configuration address, if present */
  607. config_addr = pdn->eeh_config_addr;
  608. if (pdn->eeh_pe_config_addr)
  609. config_addr = pdn->eeh_pe_config_addr;
  610. rc = rtas_call(ibm_configure_bridge,3,1, NULL,
  611. config_addr,
  612. BUID_HI(pdn->phb->buid),
  613. BUID_LO(pdn->phb->buid));
  614. if (rc) {
  615. printk (KERN_WARNING "EEH: Unable to configure device bridge (%d) for %s\n",
  616. rc, pdn->node->full_name);
  617. }
  618. }
  619. /* ------------------------------------------------------------- */
  620. /* The code below deals with enabling EEH for devices during the
  621. * early boot sequence. EEH must be enabled before any PCI probing
  622. * can be done.
  623. */
  /* Last argument of ibm,set-eeh-option when trying to enable EEH */
  #define EEH_ENABLE 1

  /* High and low BUID words of the PHB whose devices are being walked;
   * handed to early_enable_eeh() through its data pointer. */
  struct eeh_early_enable_info {
      unsigned int buid_hi;
      unsigned int buid_lo;
  };
  629. /* Enable eeh for the given device node. */
  630. static void *early_enable_eeh(struct device_node *dn, void *data)
  631. {
  632. unsigned int rets[3];
  633. struct eeh_early_enable_info *info = data;
  634. int ret;
  635. const char *status = get_property(dn, "status", NULL);
  636. const u32 *class_code = get_property(dn, "class-code", NULL);
  637. const u32 *vendor_id = get_property(dn, "vendor-id", NULL);
  638. const u32 *device_id = get_property(dn, "device-id", NULL);
  639. const u32 *regs;
  640. int enable;
  641. struct pci_dn *pdn = PCI_DN(dn);
  642. pdn->class_code = 0;
  643. pdn->eeh_mode = 0;
  644. pdn->eeh_check_count = 0;
  645. pdn->eeh_freeze_count = 0;
  646. if (status && strcmp(status, "ok") != 0)
  647. return NULL; /* ignore devices with bad status */
  648. /* Ignore bad nodes. */
  649. if (!class_code || !vendor_id || !device_id)
  650. return NULL;
  651. /* There is nothing to check on PCI to ISA bridges */
  652. if (dn->type && !strcmp(dn->type, "isa")) {
  653. pdn->eeh_mode |= EEH_MODE_NOCHECK;
  654. return NULL;
  655. }
  656. pdn->class_code = *class_code;
  657. /*
  658. * Now decide if we are going to "Disable" EEH checking
  659. * for this device. We still run with the EEH hardware active,
  660. * but we won't be checking for ff's. This means a driver
  661. * could return bad data (very bad!), an interrupt handler could
  662. * hang waiting on status bits that won't change, etc.
  663. * But there are a few cases like display devices that make sense.
  664. */
  665. enable = 1; /* i.e. we will do checking */
  666. #if 0
  667. if ((*class_code >> 16) == PCI_BASE_CLASS_DISPLAY)
  668. enable = 0;
  669. #endif
  670. if (!enable)
  671. pdn->eeh_mode |= EEH_MODE_NOCHECK;
  672. /* Ok... see if this device supports EEH. Some do, some don't,
  673. * and the only way to find out is to check each and every one. */
  674. regs = get_property(dn, "reg", NULL);
  675. if (regs) {
  676. /* First register entry is addr (00BBSS00) */
  677. /* Try to enable eeh */
  678. ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
  679. regs[0], info->buid_hi, info->buid_lo,
  680. EEH_ENABLE);
  681. enable = 0;
  682. if (ret == 0) {
  683. pdn->eeh_config_addr = regs[0];
  684. /* If the newer, better, ibm,get-config-addr-info is supported,
  685. * then use that instead. */
  686. pdn->eeh_pe_config_addr = 0;
  687. if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
  688. ret = rtas_call (ibm_get_config_addr_info, 4, 2, rets,
  689. pdn->eeh_config_addr,
  690. info->buid_hi, info->buid_lo,
  691. 0);
  692. if (ret == 0)
  693. pdn->eeh_pe_config_addr = rets[0];
  694. }
  695. /* Some older systems (Power4) allow the
  696. * ibm,set-eeh-option call to succeed even on nodes
  697. * where EEH is not supported. Verify support
  698. * explicitly. */
  699. ret = read_slot_reset_state(pdn, rets);
  700. if ((ret == 0) && (rets[1] == 1))
  701. enable = 1;
  702. }
  703. if (enable) {
  704. eeh_subsystem_enabled = 1;
  705. pdn->eeh_mode |= EEH_MODE_SUPPORTED;
  706. #ifdef DEBUG
  707. printk(KERN_DEBUG "EEH: %s: eeh enabled, config=%x pe_config=%x\n",
  708. dn->full_name, pdn->eeh_config_addr, pdn->eeh_pe_config_addr);
  709. #endif
  710. } else {
  711. /* This device doesn't support EEH, but it may have an
  712. * EEH parent, in which case we mark it as supported. */
  713. if (dn->parent && PCI_DN(dn->parent)
  714. && (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
  715. /* Parent supports EEH. */
  716. pdn->eeh_mode |= EEH_MODE_SUPPORTED;
  717. pdn->eeh_config_addr = PCI_DN(dn->parent)->eeh_config_addr;
  718. return NULL;
  719. }
  720. }
  721. } else {
  722. printk(KERN_WARNING "EEH: %s: unable to get reg property.\n",
  723. dn->full_name);
  724. }
  725. eeh_save_bars(pdn);
  726. return NULL;
  727. }
  728. /*
  729. * Initialize EEH by trying to enable it for all of the adapters in the system.
  730. * As a side effect we can determine here if eeh is supported at all.
  731. * Note that we leave EEH on so failed config cycles won't cause a machine
  732. * check. If a user turns off EEH for a particular adapter they are really
  733. * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't
  734. * grant access to a slot if EEH isn't enabled, and so we always enable
  735. * EEH for all slots/all devices.
  736. *
  737. * The eeh-force-off option disables EEH checking globally, for all slots.
  738. * Even if force-off is set, the EEH hardware is still enabled, so that
  739. * newer systems can boot.
  740. */
  741. void __init eeh_init(void)
  742. {
  743. struct device_node *phb, *np;
  744. struct eeh_early_enable_info info;
  745. spin_lock_init(&confirm_error_lock);
  746. spin_lock_init(&slot_errbuf_lock);
  747. np = of_find_node_by_path("/rtas");
  748. if (np == NULL)
  749. return;
  750. ibm_set_eeh_option = rtas_token("ibm,set-eeh-option");
  751. ibm_set_slot_reset = rtas_token("ibm,set-slot-reset");
  752. ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2");
  753. ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state");
  754. ibm_slot_error_detail = rtas_token("ibm,slot-error-detail");
  755. ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info");
  756. ibm_configure_bridge = rtas_token ("ibm,configure-bridge");
  757. if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE)
  758. return;
  759. eeh_error_buf_size = rtas_token("rtas-error-log-max");
  760. if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) {
  761. eeh_error_buf_size = 1024;
  762. }
  763. if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) {
  764. printk(KERN_WARNING "EEH: rtas-error-log-max is bigger than allocated "
  765. "buffer ! (%d vs %d)", eeh_error_buf_size, RTAS_ERROR_LOG_MAX);
  766. eeh_error_buf_size = RTAS_ERROR_LOG_MAX;
  767. }
  768. /* Enable EEH for all adapters. Note that eeh requires buid's */
  769. for (phb = of_find_node_by_name(NULL, "pci"); phb;
  770. phb = of_find_node_by_name(phb, "pci")) {
  771. unsigned long buid;
  772. buid = get_phb_buid(phb);
  773. if (buid == 0 || PCI_DN(phb) == NULL)
  774. continue;
  775. info.buid_lo = BUID_LO(buid);
  776. info.buid_hi = BUID_HI(buid);
  777. traverse_pci_devices(phb, early_enable_eeh, &info);
  778. }
  779. if (eeh_subsystem_enabled)
  780. printk(KERN_INFO "EEH: PCI Enhanced I/O Error Handling Enabled\n");
  781. else
  782. printk(KERN_WARNING "EEH: No capable adapters found\n");
  783. }
  784. /**
  785. * eeh_add_device_early - enable EEH for the indicated device_node
  786. * @dn: device node for which to set up EEH
  787. *
  788. * This routine must be used to perform EEH initialization for PCI
  789. * devices that were added after system boot (e.g. hotplug, dlpar).
  790. * This routine must be called before any i/o is performed to the
  791. * adapter (inluding any config-space i/o).
  792. * Whether this actually enables EEH or not for this device depends
  793. * on the CEC architecture, type of the device, on earlier boot
  794. * command-line arguments & etc.
  795. */
  796. static void eeh_add_device_early(struct device_node *dn)
  797. {
  798. struct pci_controller *phb;
  799. struct eeh_early_enable_info info;
  800. if (!dn || !PCI_DN(dn))
  801. return;
  802. phb = PCI_DN(dn)->phb;
  803. /* USB Bus children of PCI devices will not have BUID's */
  804. if (NULL == phb || 0 == phb->buid)
  805. return;
  806. info.buid_hi = BUID_HI(phb->buid);
  807. info.buid_lo = BUID_LO(phb->buid);
  808. early_enable_eeh(dn, &info);
  809. }
  810. void eeh_add_device_tree_early(struct device_node *dn)
  811. {
  812. struct device_node *sib;
  813. for (sib = dn->child; sib; sib = sib->sibling)
  814. eeh_add_device_tree_early(sib);
  815. eeh_add_device_early(dn);
  816. }
  817. EXPORT_SYMBOL_GPL(eeh_add_device_tree_early);
  818. /**
  819. * eeh_add_device_late - perform EEH initialization for the indicated pci device
  820. * @dev: pci device for which to set up EEH
  821. *
  822. * This routine must be used to complete EEH initialization for PCI
  823. * devices that were added after system boot (e.g. hotplug, dlpar).
  824. */
  825. static void eeh_add_device_late(struct pci_dev *dev)
  826. {
  827. struct device_node *dn;
  828. struct pci_dn *pdn;
  829. if (!dev || !eeh_subsystem_enabled)
  830. return;
  831. #ifdef DEBUG
  832. printk(KERN_DEBUG "EEH: adding device %s\n", pci_name(dev));
  833. #endif
  834. pci_dev_get (dev);
  835. dn = pci_device_to_OF_node(dev);
  836. pdn = PCI_DN(dn);
  837. pdn->pcidev = dev;
  838. pci_addr_cache_insert_device (dev);
  839. }
  840. void eeh_add_device_tree_late(struct pci_bus *bus)
  841. {
  842. struct pci_dev *dev;
  843. list_for_each_entry(dev, &bus->devices, bus_list) {
  844. eeh_add_device_late(dev);
  845. if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
  846. struct pci_bus *subbus = dev->subordinate;
  847. if (subbus)
  848. eeh_add_device_tree_late(subbus);
  849. }
  850. }
  851. }
  852. EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
  853. /**
  854. * eeh_remove_device - undo EEH setup for the indicated pci device
  855. * @dev: pci device to be removed
  856. *
  857. * This routine should be called when a device is removed from
  858. * a running system (e.g. by hotplug or dlpar). It unregisters
  859. * the PCI device from the EEH subsystem. I/O errors affecting
  860. * this device will no longer be detected after this call; thus,
  861. * i/o errors affecting this slot may leave this device unusable.
  862. */
  863. static void eeh_remove_device(struct pci_dev *dev)
  864. {
  865. struct device_node *dn;
  866. if (!dev || !eeh_subsystem_enabled)
  867. return;
  868. /* Unregister the device with the EEH/PCI address search system */
  869. #ifdef DEBUG
  870. printk(KERN_DEBUG "EEH: remove device %s\n", pci_name(dev));
  871. #endif
  872. pci_addr_cache_remove_device(dev);
  873. dn = pci_device_to_OF_node(dev);
  874. if (PCI_DN(dn)->pcidev) {
  875. PCI_DN(dn)->pcidev = NULL;
  876. pci_dev_put (dev);
  877. }
  878. }
  879. void eeh_remove_bus_device(struct pci_dev *dev)
  880. {
  881. struct pci_bus *bus = dev->subordinate;
  882. struct pci_dev *child, *tmp;
  883. eeh_remove_device(dev);
  884. if (bus && dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
  885. list_for_each_entry_safe(child, tmp, &bus->devices, bus_list)
  886. eeh_remove_bus_device(child);
  887. }
  888. }
  889. EXPORT_SYMBOL_GPL(eeh_remove_bus_device);
  890. static int proc_eeh_show(struct seq_file *m, void *v)
  891. {
  892. if (0 == eeh_subsystem_enabled) {
  893. seq_printf(m, "EEH Subsystem is globally disabled\n");
  894. seq_printf(m, "eeh_total_mmio_ffs=%ld\n", total_mmio_ffs);
  895. } else {
  896. seq_printf(m, "EEH Subsystem is enabled\n");
  897. seq_printf(m,
  898. "no device=%ld\n"
  899. "no device node=%ld\n"
  900. "no config address=%ld\n"
  901. "check not wanted=%ld\n"
  902. "eeh_total_mmio_ffs=%ld\n"
  903. "eeh_false_positives=%ld\n"
  904. "eeh_ignored_failures=%ld\n"
  905. "eeh_slot_resets=%ld\n",
  906. no_device, no_dn, no_cfg_addr,
  907. ignored_check, total_mmio_ffs,
  908. false_positives, ignored_failures,
  909. slot_resets);
  910. }
  911. return 0;
  912. }
  913. static int proc_eeh_open(struct inode *inode, struct file *file)
  914. {
  915. return single_open(file, proc_eeh_show, NULL);
  916. }
  917. static struct file_operations proc_eeh_operations = {
  918. .open = proc_eeh_open,
  919. .read = seq_read,
  920. .llseek = seq_lseek,
  921. .release = single_release,
  922. };
  923. static int __init eeh_init_proc(void)
  924. {
  925. struct proc_dir_entry *e;
  926. if (machine_is(pseries)) {
  927. e = create_proc_entry("ppc64/eeh", 0, NULL);
  928. if (e)
  929. e->proc_fops = &proc_eeh_operations;
  930. }
  931. return 0;
  932. }
  933. __initcall(eeh_init_proc);