aboutsummaryrefslogtreecommitdiff
path: root/arch/powerpc/kernel
diff options
context:
space:
mode:
authorGavin Shan <shangw@linux.vnet.ibm.com>2013-06-20 13:21:01 +0800
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2013-06-20 17:06:04 +1000
commit5a71978e4b6ee6a01bc6aab926a3571055123029 (patch)
treed879a16b3d1d78e90b84070633375064fa4c5a0c /arch/powerpc/kernel
parentc86085580d5f60d2d3cea9c60d50e284558d3de7 (diff)
powerpc/eeh: Trace time on first error for PE
We're not expecting that one specific PE got frozen for over 5 times in last hour. Otherwise, the PE will be removed from the system upon newly coming EEH errors. The patch introduces time stamp to trace the first error on specific PE in last hour and function to update that accordingly. Besides, the time stamp is recovered during PE hotplug path as we did for frozen count. Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/kernel')
-rw-r--r--arch/powerpc/kernel/eeh_driver.c5
-rw-r--r--arch/powerpc/kernel/eeh_pe.c27
2 files changed, 32 insertions, 0 deletions
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index fb927af9a9e..678bc6cddf8 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -349,10 +349,12 @@ static void *eeh_report_failure(void *data, void *userdata)
*/
static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
{
+ struct timeval tstamp;
int cnt, rc;
/* pcibios will clear the counter; save the value */
cnt = pe->freeze_count;
+ tstamp = pe->tstamp;
/*
* We don't remove the corresponding PE instances because
@@ -385,6 +387,8 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
ssleep(5);
pcibios_add_pci_devices(bus);
}
+
+ pe->tstamp = tstamp;
pe->freeze_count = cnt;
return 0;
@@ -425,6 +429,7 @@ void eeh_handle_event(struct eeh_pe *pe)
return;
}
+ eeh_pe_update_time_stamp(pe);
pe->freeze_count++;
if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES)
goto excess_failures;
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index c96366758ac..ae75722c058 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -482,6 +482,33 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe)
}
/**
+ * eeh_pe_update_time_stamp - Update PE's frozen time stamp
+ * @pe: EEH PE
+ *
+ * We have time stamp for each PE to trace its time of getting
+ * frozen in last hour. The function should be called to update
+ * the time stamp on first error of the specific PE. On the other
+ * handle, we needn't account for errors happened in last hour.
+ */
+void eeh_pe_update_time_stamp(struct eeh_pe *pe)
+{
+ struct timeval tstamp;
+
+ if (!pe) return;
+
+ if (pe->freeze_count <= 0) {
+ pe->freeze_count = 0;
+ do_gettimeofday(&pe->tstamp);
+ } else {
+ do_gettimeofday(&tstamp);
+ if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) {
+ pe->tstamp = tstamp;
+ pe->freeze_count = 0;
+ }
+ }
+}
+
+/**
* __eeh_pe_state_mark - Mark the state for the PE
* @data: EEH PE
* @flag: state