Add error-event support in XE drm_ras to notify userspace when an error
occurs.

$ sudo ynl --family drm_ras --output-json --subscribe error-notify
{
  "name": "error-event",
  "msg": {
    "device-name": "0000:03:00.0",
    "node-id": 1,
    "node-name": "uncorrectable-errors",
    "error-id": 1,
    "error-name": "core-compute",
    "error-value": 1
  }
}

Signed-off-by: Riana Tauro
---
v2: use ynl (Raag)
    use value as function parameter
    move error event call to hw_error_source_handler
---
 drivers/gpu/drm/xe/xe_drm_ras.c  | 26 ++++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_drm_ras.h  |  3 +++
 drivers/gpu/drm/xe/xe_hw_error.c |  5 ++++-
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c
index cd236f53699e..09588e3fd984 100644
--- a/drivers/gpu/drm/xe/xe_drm_ras.c
+++ b/drivers/gpu/drm/xe/xe_drm_ras.c
@@ -174,6 +174,32 @@ static int register_nodes(struct xe_device *xe)
 	return ret;
 }
 
+/**
+ * xe_drm_ras_event() - Notify userspace of an error event
+ * @xe: xe device structure
+ * @component: error component (see &enum drm_xe_ras_error_component)
+ * @severity: error severity (see &enum drm_xe_ras_error_severity)
+ * @value: value of error counter
+ * @flags: flags for allocation
+ *
+ * Notifies userspace of an error.
+ */
+void xe_drm_ras_event(struct xe_device *xe, u32 component, u32 severity, u32 value, gfp_t flags)
+{
+	struct xe_drm_ras *ras = &xe->ras;
+	struct xe_drm_ras_counter *info = ras->info[severity];
+	struct drm_ras_node *node = &ras->node[severity];
+	int ret;
+
+	if (!info || !info[component].name)
+		return;
+
+	ret = drm_ras_nl_error_event(node, component, info[component].name, value, flags);
+	if (ret)
+		drm_err(&xe->drm, "RAS error-event failed: %d for %s %s\n", ret,
+			info[component].name, error_severity[severity]);
+}
+
 /**
  * xe_drm_ras_init() - Initialize DRM RAS
  * @xe: xe device instance
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.h b/drivers/gpu/drm/xe/xe_drm_ras.h
index 365c70e93e82..2a694bf69478 100644
--- a/drivers/gpu/drm/xe/xe_drm_ras.h
+++ b/drivers/gpu/drm/xe/xe_drm_ras.h
@@ -5,11 +5,14 @@
 #ifndef _XE_DRM_RAS_H_
 #define _XE_DRM_RAS_H_
 
+#include <linux/types.h>
+
 struct xe_device;
 
 #define for_each_error_severity(i) \
 	for (i = 0; i < DRM_XE_RAS_ERR_SEV_MAX; i++)
 
 int xe_drm_ras_init(struct xe_device *xe);
+void xe_drm_ras_event(struct xe_device *xe, u32 component, u32 severity, u32 value, gfp_t flags);
 
 #endif
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 4b72959b2276..6e3306dedb24 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -432,7 +432,7 @@ static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_er
 	struct xe_drm_ras *ras = &xe->ras;
 	struct xe_drm_ras_counter *info = ras->info[severity];
 	unsigned long flags, err_src;
-	u32 err_bit;
+	u32 err_bit, value;
 
 	if (!IS_DGFX(xe))
 		return;
@@ -485,6 +485,9 @@ static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_er
 			gt_hw_error_handler(tile, hw_err, error_id);
 		if (err_bit == XE_SOC_ERROR)
 			soc_hw_error_handler(tile, hw_err, error_id);
+
+		value = atomic_read(&info[error_id].counter);
+		xe_drm_ras_event(xe, error_id, severity, value, GFP_ATOMIC);
 	}
 
 clear_reg:
-- 
2.47.1