devlink: Add health report functionality
Upon error discovery, every driver can report it to the devlink health mechanism via the devlink_health_report function, using the appropriate reporter registered to it. The driver can pass error-specific context, which will be delivered to it as part of the dump / recovery callbacks.

Once an error is reported, devlink health will perform the following actions:
* A log is sent to the kernel trace events buffer
* Health status and statistics are updated for the reporter instance
* An object dump is taken and stored at the reporter instance (as long as there is no other dump which is already stored)
* An auto recovery attempt is made. This depends on:
  - Auto Recovery configuration
  - Grace period vs. time since last recovery

Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:

committed by
David S. Miller

parent
880ee82f03
commit
c7af343b4e
@@ -46,6 +46,65 @@ TRACE_EVENT(devlink_hwmsg,
|
||||
(int) __entry->len, __get_dynamic_array(buf), __entry->len)
|
||||
);
|
||||
|
||||
TRACE_EVENT(devlink_health_report,
|
||||
TP_PROTO(const struct devlink *devlink, const char *reporter_name,
|
||||
const char *msg),
|
||||
|
||||
TP_ARGS(devlink, reporter_name, msg),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__string(bus_name, devlink->dev->bus->name)
|
||||
__string(dev_name, dev_name(devlink->dev))
|
||||
__string(driver_name, devlink->dev->driver->name)
|
||||
__string(reporter_name, msg)
|
||||
__string(msg, msg)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__assign_str(bus_name, devlink->dev->bus->name);
|
||||
__assign_str(dev_name, dev_name(devlink->dev));
|
||||
__assign_str(driver_name, devlink->dev->driver->name);
|
||||
__assign_str(reporter_name, reporter_name);
|
||||
__assign_str(msg, msg);
|
||||
),
|
||||
|
||||
TP_printk("bus_name=%s dev_name=%s driver_name=%s reporter_name=%s: %s",
|
||||
__get_str(bus_name), __get_str(dev_name),
|
||||
__get_str(driver_name), __get_str(reporter_name),
|
||||
__get_str(msg))
|
||||
);
|
||||
|
||||
TRACE_EVENT(devlink_health_recover_aborted,
|
||||
TP_PROTO(const struct devlink *devlink, const char *reporter_name,
|
||||
bool health_state, u64 time_since_last_recover),
|
||||
|
||||
TP_ARGS(devlink, reporter_name, health_state, time_since_last_recover),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__string(bus_name, devlink->dev->bus->name)
|
||||
__string(dev_name, dev_name(devlink->dev))
|
||||
__string(driver_name, devlink->dev->driver->name)
|
||||
__string(reporter_name, reporter_name)
|
||||
__field(bool, health_state)
|
||||
__field(u64, time_since_last_recover)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__assign_str(bus_name, devlink->dev->bus->name);
|
||||
__assign_str(dev_name, dev_name(devlink->dev));
|
||||
__assign_str(driver_name, devlink->dev->driver->name);
|
||||
__assign_str(reporter_name, reporter_name);
|
||||
__entry->health_state = health_state;
|
||||
__entry->time_since_last_recover = time_since_last_recover;
|
||||
),
|
||||
|
||||
TP_printk("bus_name=%s dev_name=%s driver_name=%s reporter_name=%s: health_state=%d time_since_last_recover = %llu recover aborted",
|
||||
__get_str(bus_name), __get_str(dev_name),
|
||||
__get_str(driver_name), __get_str(reporter_name),
|
||||
__entry->health_state,
|
||||
__entry->time_since_last_recover)
|
||||
);
|
||||
|
||||
#endif /* _TRACE_DEVLINK_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
@@ -64,6 +123,9 @@ static inline void trace_devlink_hwmsg(const struct devlink *devlink,
|
||||
{
|
||||
}
|
||||
|
||||
static inline void trace_devlink_health(const char *msg)
|
||||
{
|
||||
}
|
||||
#endif /* _TRACE_DEVLINK_H */
|
||||
|
||||
#endif
|
||||
|
Reference in New Issue
Block a user