[PATCH] powerpc/pseries: clear PCI failure counter if no new failures
The current PCI error recovery system keeps track of the number of PCI card resets, and refuses to bring a card back up if this number is too large. The goal of doing this was to avoid an infinite loop of resets if a card is obviously dead. However, if the failures are rare, but the machine has a high uptime, this mechanism might still be triggered; this is too harsh. This patch avoids this problem by decrementing the fail count after an hour. Thus, as long as a PCI card BSODs fewer than 6 times an hour, it will continue to be reset indefinitely. If its failure rate is greater than that, it will be taken off-line permanently. This patch is larger than it might otherwise be because it changes indentation by removing a pointless while-loop. The while loop is not needed, as the handler is invoked once for each event (by schedule_work()); the loop is leftover cruft from an earlier implementation. Signed-off-by: Linas Vepstas <linas@austin.ibm.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
This commit is contained in:
parent
4bd174fe1c
commit
ac325acd50
|
@ -23,9 +23,8 @@
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
#include <linux/delay.h>
|
#include <linux/delay.h>
|
||||||
#include <linux/irq.h>
|
|
||||||
#include <linux/interrupt.h>
|
#include <linux/interrupt.h>
|
||||||
#include <linux/notifier.h>
|
#include <linux/irq.h>
|
||||||
#include <linux/pci.h>
|
#include <linux/pci.h>
|
||||||
#include <asm/eeh.h>
|
#include <asm/eeh.h>
|
||||||
#include <asm/eeh_event.h>
|
#include <asm/eeh_event.h>
|
||||||
|
@ -250,7 +249,7 @@ static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus)
|
||||||
*/
|
*/
|
||||||
#define MAX_WAIT_FOR_RECOVERY 15
|
#define MAX_WAIT_FOR_RECOVERY 15
|
||||||
|
|
||||||
void handle_eeh_events (struct eeh_event *event)
|
struct pci_dn * handle_eeh_events (struct eeh_event *event)
|
||||||
{
|
{
|
||||||
struct device_node *frozen_dn;
|
struct device_node *frozen_dn;
|
||||||
struct pci_dn *frozen_pdn;
|
struct pci_dn *frozen_pdn;
|
||||||
|
@ -265,7 +264,7 @@ void handle_eeh_events (struct eeh_event *event)
|
||||||
if (!frozen_dn) {
|
if (!frozen_dn) {
|
||||||
printk(KERN_ERR "EEH: Error: Cannot find partition endpoint for %s\n",
|
printk(KERN_ERR "EEH: Error: Cannot find partition endpoint for %s\n",
|
||||||
pci_name(event->dev));
|
pci_name(event->dev));
|
||||||
return;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* There are two different styles for coming up with the PE.
|
/* There are two different styles for coming up with the PE.
|
||||||
|
@ -280,7 +279,7 @@ void handle_eeh_events (struct eeh_event *event)
|
||||||
if (!frozen_bus) {
|
if (!frozen_bus) {
|
||||||
printk(KERN_ERR "EEH: Cannot find PCI bus for %s\n",
|
printk(KERN_ERR "EEH: Cannot find PCI bus for %s\n",
|
||||||
frozen_dn->full_name);
|
frozen_dn->full_name);
|
||||||
return;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
@ -355,7 +354,7 @@ void handle_eeh_events (struct eeh_event *event)
|
||||||
/* Tell all device drivers that they can resume operations */
|
/* Tell all device drivers that they can resume operations */
|
||||||
pci_walk_bus(frozen_bus, eeh_report_resume, NULL);
|
pci_walk_bus(frozen_bus, eeh_report_resume, NULL);
|
||||||
|
|
||||||
return;
|
return frozen_pdn;
|
||||||
|
|
||||||
excess_failures:
|
excess_failures:
|
||||||
/*
|
/*
|
||||||
|
@ -384,6 +383,8 @@ perm_error:
|
||||||
|
|
||||||
/* Shut down the device drivers for good. */
|
/* Shut down the device drivers for good. */
|
||||||
pcibios_remove_pci_devices(frozen_bus);
|
pcibios_remove_pci_devices(frozen_bus);
|
||||||
|
|
||||||
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ---------- end of file ---------- */
|
/* ---------- end of file ---------- */
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
* Copyright (c) 2005 Linas Vepstas <linas@linas.org>
|
* Copyright (c) 2005 Linas Vepstas <linas@linas.org>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <linux/delay.h>
|
||||||
#include <linux/list.h>
|
#include <linux/list.h>
|
||||||
#include <linux/mutex.h>
|
#include <linux/mutex.h>
|
||||||
#include <linux/pci.h>
|
#include <linux/pci.h>
|
||||||
|
@ -56,38 +57,43 @@ static int eeh_event_handler(void * dummy)
|
||||||
{
|
{
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
struct eeh_event *event;
|
struct eeh_event *event;
|
||||||
|
struct pci_dn *pdn;
|
||||||
|
|
||||||
daemonize ("eehd");
|
daemonize ("eehd");
|
||||||
|
set_current_state(TASK_INTERRUPTIBLE);
|
||||||
|
|
||||||
while (1) {
|
spin_lock_irqsave(&eeh_eventlist_lock, flags);
|
||||||
set_current_state(TASK_INTERRUPTIBLE);
|
event = NULL;
|
||||||
|
|
||||||
spin_lock_irqsave(&eeh_eventlist_lock, flags);
|
/* Unqueue the event, get ready to process. */
|
||||||
event = NULL;
|
if (!list_empty(&eeh_eventlist)) {
|
||||||
|
event = list_entry(eeh_eventlist.next, struct eeh_event, list);
|
||||||
|
list_del(&event->list);
|
||||||
|
}
|
||||||
|
spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
|
||||||
|
|
||||||
/* Unqueue the event, get ready to process. */
|
if (event == NULL)
|
||||||
if (!list_empty(&eeh_eventlist)) {
|
return 0;
|
||||||
event = list_entry(eeh_eventlist.next, struct eeh_event, list);
|
|
||||||
list_del(&event->list);
|
|
||||||
}
|
|
||||||
spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
|
|
||||||
|
|
||||||
if (event == NULL)
|
/* Serialize processing of EEH events */
|
||||||
break;
|
mutex_lock(&eeh_event_mutex);
|
||||||
|
eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
|
||||||
|
|
||||||
/* Serialize processing of EEH events */
|
printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
|
||||||
mutex_lock(&eeh_event_mutex);
|
pci_name(event->dev));
|
||||||
eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
|
|
||||||
|
|
||||||
printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
|
pdn = handle_eeh_events(event);
|
||||||
pci_name(event->dev));
|
|
||||||
|
|
||||||
handle_eeh_events(event);
|
eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
|
||||||
|
pci_dev_put(event->dev);
|
||||||
|
kfree(event);
|
||||||
|
mutex_unlock(&eeh_event_mutex);
|
||||||
|
|
||||||
eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
|
/* If there are no new errors after an hour, clear the counter. */
|
||||||
pci_dev_put(event->dev);
|
if (pdn && pdn->eeh_freeze_count>0) {
|
||||||
kfree(event);
|
msleep_interruptible (3600*1000);
|
||||||
mutex_unlock(&eeh_event_mutex);
|
if (pdn->eeh_freeze_count>0)
|
||||||
|
pdn->eeh_freeze_count--;
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -18,8 +18,8 @@
|
||||||
* Copyright (c) 2005 Linas Vepstas <linas@linas.org>
|
* Copyright (c) 2005 Linas Vepstas <linas@linas.org>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef ASM_PPC64_EEH_EVENT_H
|
#ifndef ASM_POWERPC_EEH_EVENT_H
|
||||||
#define ASM_PPC64_EEH_EVENT_H
|
#define ASM_POWERPC_EEH_EVENT_H
|
||||||
#ifdef __KERNEL__
|
#ifdef __KERNEL__
|
||||||
|
|
||||||
/** EEH event -- structure holding pci controller data that describes
|
/** EEH event -- structure holding pci controller data that describes
|
||||||
|
@ -39,7 +39,7 @@ struct eeh_event {
|
||||||
* @dev pci device
|
* @dev pci device
|
||||||
*
|
*
|
||||||
* This routine builds a PCI error event which will be delivered
|
* This routine builds a PCI error event which will be delivered
|
||||||
* to all listeners on the peh_notifier_chain.
|
* to all listeners on the eeh_notifier_chain.
|
||||||
*
|
*
|
||||||
* This routine can be called within an interrupt context;
|
* This routine can be called within an interrupt context;
|
||||||
* the actual event will be delivered in a normal context
|
* the actual event will be delivered in a normal context
|
||||||
|
@ -51,7 +51,7 @@ int eeh_send_failure_event (struct device_node *dn,
|
||||||
int time_unavail);
|
int time_unavail);
|
||||||
|
|
||||||
/* Main recovery function */
|
/* Main recovery function */
|
||||||
void handle_eeh_events (struct eeh_event *);
|
struct pci_dn * handle_eeh_events (struct eeh_event *);
|
||||||
|
|
||||||
#endif /* __KERNEL__ */
|
#endif /* __KERNEL__ */
|
||||||
#endif /* ASM_PPC64_EEH_EVENT_H */
|
#endif /* ASM_POWERPC_EEH_EVENT_H */
|
||||||
|
|
Loading…
Reference in New Issue