Counter helpers deal with counter values. Use the appropriate naming to match with their functionality. Signed-off-by: Raag Jadav --- drivers/gpu/drm/drm_ras.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/drm_ras.c b/drivers/gpu/drm/drm_ras.c index d6eab29a1394..03db53d03329 100644 --- a/drivers/gpu/drm/drm_ras.c +++ b/drivers/gpu/drm/drm_ras.c @@ -168,8 +168,8 @@ static int get_node_error_counter(u32 node_id, u32 error_id, return node->query_error_counter(node, error_id, name, value); } -static int msg_reply_value(struct sk_buff *msg, u32 error_id, - const char *error_name, u32 value) +static int msg_reply_counter_value(struct sk_buff *msg, u32 error_id, + const char *error_name, u32 value) { int ret; @@ -186,8 +186,8 @@ static int msg_reply_value(struct sk_buff *msg, u32 error_id, value); } -static int doit_reply_value(struct genl_info *info, u32 node_id, - u32 error_id) +static int doit_reply_counter_value(struct genl_info *info, u32 node_id, + u32 error_id) { struct sk_buff *msg; struct nlattr *hdr; @@ -210,7 +210,7 @@ static int doit_reply_value(struct genl_info *info, u32 node_id, if (ret) return ret; - ret = msg_reply_value(msg, error_id, error_name, value); + ret = msg_reply_counter_value(msg, error_id, error_name, value); if (ret) { genlmsg_cancel(msg, hdr); nlmsg_free(msg); @@ -278,7 +278,7 @@ int drm_ras_nl_get_error_counter_dumpit(struct sk_buff *skb, break; } - ret = msg_reply_value(skb, error_id, error_name, value); + ret = msg_reply_counter_value(skb, error_id, error_name, value); if (ret) { genlmsg_cancel(skb, hdr); break; @@ -317,7 +317,7 @@ int drm_ras_nl_get_error_counter_doit(struct sk_buff *skb, node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]); error_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID]); - return doit_reply_value(info, node_id, error_id); + return doit_reply_counter_value(info, node_id, error_id); } /** -- 2.43.0 Add get-error-threshold 
command support which allows querying threshold value of an error. Threshold in RAS context means the number of errors the hardware is expected to accumulate before it raises them to software. This is to have a fine grained control over error notifications that are raised by the hardware. Signed-off-by: Raag Jadav --- v2: Document threshold definition (Riana) Return -EOPNOTSUPP on threshold callbacks absence (Riana) Cancel and free genlmsg on failure (Riana) --- Documentation/gpu/drm-ras.rst | 9 ++ Documentation/netlink/specs/drm_ras.yaml | 28 +++++- drivers/gpu/drm/drm_ras.c | 109 +++++++++++++++++++++++ drivers/gpu/drm/drm_ras_nl.c | 13 +++ drivers/gpu/drm/drm_ras_nl.h | 2 + include/drm/drm_ras.h | 15 ++++ include/uapi/drm/drm_ras.h | 2 + 7 files changed, 176 insertions(+), 2 deletions(-) diff --git a/Documentation/gpu/drm-ras.rst b/Documentation/gpu/drm-ras.rst index 4636e68f5678..dfa72e8becda 100644 --- a/Documentation/gpu/drm-ras.rst +++ b/Documentation/gpu/drm-ras.rst @@ -54,6 +54,8 @@ User space tools can: ``node-id`` and ``error-id`` as parameters. * Clear specific error counters with the ``clear-error-counter`` command, using both ``node-id`` and ``error-id`` as parameters. +* Query specific error threshold value with the ``get-error-threshold`` command, using both + ``node-id`` and ``error-id`` as parameters. YAML-based Interface -------------------- @@ -109,3 +111,10 @@ Example: Clear an error counter for a given node sudo ynl --family drm_ras --do clear-error-counter --json '{"node-id":0, "error-id":1}' None + +Example: Query threshold value of a given error + +.. 
code-block:: bash + + sudo ynl --family drm_ras --do get-error-threshold --json '{"node-id":0, "error-id":1}' + {'error-id': 1, 'error-name': 'error_name1', 'error-threshold': 16} diff --git a/Documentation/netlink/specs/drm_ras.yaml b/Documentation/netlink/specs/drm_ras.yaml index e113056f8c01..016d713069bb 100644 --- a/Documentation/netlink/specs/drm_ras.yaml +++ b/Documentation/netlink/specs/drm_ras.yaml @@ -8,8 +8,10 @@ doc: >- DRM RAS (Reliability, Availability, Serviceability) over Generic Netlink. Provides a standardized mechanism for DRM drivers to register "nodes" representing hardware/software components capable of reporting error counters. - Userspace tools can query the list of nodes or individual error counters - via the Generic Netlink interface. + Userspace tools can query the list of nodes or individual error counters or + their thresholds via the Generic Netlink interface. Threshold in RAS context + means the number of errors the hardware is expected to accumulate before it + raises them to software. definitions: - @@ -69,6 +71,10 @@ attribute-sets: name: error-value type: u32 doc: Current value of the requested error counter. + - + name: error-threshold + type: u32 + doc: Threshold value of the error counter. operations: list: @@ -124,3 +130,21 @@ operations: do: request: attributes: *id-attrs + - + name: get-error-threshold + doc: >- + Retrieve threshold value of an error. + The response includes the id, the name, and current threshold + value of the error. + attribute-set: error-counter-attrs + flags: [admin-perm] + do: + request: + attributes: + - node-id + - error-id + reply: + attributes: + - error-id + - error-name + - error-threshold diff --git a/drivers/gpu/drm/drm_ras.c b/drivers/gpu/drm/drm_ras.c index 03db53d03329..87e57bd1e8ad 100644 --- a/drivers/gpu/drm/drm_ras.c +++ b/drivers/gpu/drm/drm_ras.c @@ -41,6 +41,10 @@ * Userspace must provide Node ID, Error ID. * Clears specific error counter of a node if supported. * + * 4. 
GET_ERROR_THRESHOLD: Query threshold value of an error. + * Userspace must provide Node ID and Error ID. + * Returns the threshold value of a specific error. + * * Node registration: * * - drm_ras_node_register(): Registers a new node and assigns @@ -72,6 +76,8 @@ * operation, fetching a counter value from a specific node. * - drm_ras_nl_clear_error_counter_doit(): Implements the CLEAR_ERROR_COUNTER doit * operation, clearing a counter value from a specific node. + * - drm_ras_nl_get_error_threshold_doit(): Implements the GET_ERROR_THRESHOLD doit + * operation, fetching the threshold value of a specific error. */ static DEFINE_XARRAY_ALLOC(drm_ras_xa); @@ -168,6 +174,25 @@ static int get_node_error_counter(u32 node_id, u32 error_id, return node->query_error_counter(node, error_id, name, value); } +static int get_node_error_threshold(u32 node_id, u32 error_id, + const char **name, u32 *value) +{ + struct drm_ras_node *node; + + node = xa_load(&drm_ras_xa, node_id); + if (!node) + return -ENOENT; + + if (!node->query_error_threshold) + return -EOPNOTSUPP; + + if (error_id < node->error_counter_range.first || + error_id > node->error_counter_range.last) + return -EINVAL; + + return node->query_error_threshold(node, error_id, name, value); +} + static int msg_reply_counter_value(struct sk_buff *msg, u32 error_id, const char *error_name, u32 value) { @@ -186,6 +211,24 @@ static int msg_reply_counter_value(struct sk_buff *msg, u32 error_id, value); } +static int msg_reply_threshold_value(struct sk_buff *msg, u32 error_id, + const char *error_name, u32 value) +{ + int ret; + + ret = nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, error_id); + if (ret) + return ret; + + ret = nla_put_string(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_NAME, + error_name); + if (ret) + return ret; + + return nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_THRESHOLD, + value); +} + static int doit_reply_counter_value(struct genl_info *info, u32 node_id, u32 error_id) { @@ -222,6 
+265,45 @@ static int doit_reply_counter_value(struct genl_info *info, u32 node_id, return genlmsg_reply(msg, info); } +static int doit_reply_threshold_value(struct genl_info *info, u32 node_id, + u32 error_id) +{ + struct sk_buff *msg; + struct nlattr *hdr; + const char *error_name; + u32 value; + int ret; + + msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_iput(msg, info); + if (!hdr) { + nlmsg_free(msg); + return -EMSGSIZE; + } + + ret = get_node_error_threshold(node_id, error_id, + &error_name, &value); + if (ret) { + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); + return ret; + } + + ret = msg_reply_threshold_value(msg, error_id, error_name, value); + if (ret) { + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); + return ret; + } + + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); +} + /** * drm_ras_nl_get_error_counter_dumpit() - Dump all Error Counters * @skb: Netlink message buffer @@ -355,6 +437,33 @@ int drm_ras_nl_clear_error_counter_doit(struct sk_buff *skb, return node->clear_error_counter(node, error_id); } +/** + * drm_ras_nl_get_error_threshold_doit() - Query threshold value of an error + * @skb: Netlink message buffer + * @info: Generic Netlink info containing attributes of the request + * + * Extracts the Node ID and Error ID from the netlink attributes and + * retrieves the threshold value of the corresponding error. Sends the + * result back to the requesting user via the standard Genl reply. + * + * Return: 0 on success, or negative errno on failure. 
+ */ +int drm_ras_nl_get_error_threshold_doit(struct sk_buff *skb, + struct genl_info *info) +{ + u32 node_id, error_id; + + if (!info->attrs || + GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID) || + GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID)) + return -EINVAL; + + node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]); + error_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID]); + + return doit_reply_threshold_value(info, node_id, error_id); +} + /** * drm_ras_node_register() - Register a new RAS node * @node: Node structure to register diff --git a/drivers/gpu/drm/drm_ras_nl.c b/drivers/gpu/drm/drm_ras_nl.c index dea1c1b2494e..ecec2041c758 100644 --- a/drivers/gpu/drm/drm_ras_nl.c +++ b/drivers/gpu/drm/drm_ras_nl.c @@ -28,6 +28,12 @@ static const struct nla_policy drm_ras_clear_error_counter_nl_policy[DRM_RAS_A_E [DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID] = { .type = NLA_U32, }, }; +/* DRM_RAS_CMD_GET_ERROR_THRESHOLD - do */ +static const struct nla_policy drm_ras_get_error_threshold_nl_policy[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID + 1] = { + [DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID] = { .type = NLA_U32, }, + [DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID] = { .type = NLA_U32, }, +}; + /* Ops table for drm_ras */ static const struct genl_split_ops drm_ras_nl_ops[] = { { @@ -56,6 +62,13 @@ static const struct genl_split_ops drm_ras_nl_ops[] = { .maxattr = DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = DRM_RAS_CMD_GET_ERROR_THRESHOLD, + .doit = drm_ras_nl_get_error_threshold_doit, + .policy = drm_ras_get_error_threshold_nl_policy, + .maxattr = DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, }; struct genl_family drm_ras_nl_family __ro_after_init = { diff --git a/drivers/gpu/drm/drm_ras_nl.h b/drivers/gpu/drm/drm_ras_nl.h index a398643572a5..399280c2c6e1 100644 --- a/drivers/gpu/drm/drm_ras_nl.h +++ 
b/drivers/gpu/drm/drm_ras_nl.h @@ -20,6 +20,8 @@ int drm_ras_nl_get_error_counter_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int drm_ras_nl_clear_error_counter_doit(struct sk_buff *skb, struct genl_info *info); +int drm_ras_nl_get_error_threshold_doit(struct sk_buff *skb, + struct genl_info *info); extern struct genl_family drm_ras_nl_family; diff --git a/include/drm/drm_ras.h b/include/drm/drm_ras.h index f2a787bc4f64..7bb429d85f57 100644 --- a/include/drm/drm_ras.h +++ b/include/drm/drm_ras.h @@ -69,6 +69,21 @@ struct drm_ras_node { */ int (*clear_error_counter)(struct drm_ras_node *node, u32 error_id); + /** + * @query_error_threshold: + * + * This callback is used by drm-ras to query threshold value of a + * specific error. + * + * Driver should expect query_error_threshold() to be called with + * error_id from `error_counter_range.first` to + * `error_counter_range.last`. + * + * Returns: 0 on success, negative error code on failure. + */ + int (*query_error_threshold)(struct drm_ras_node *node, u32 error_id, + const char **name, u32 *val); + /** @priv: Driver private data */ void *priv; }; diff --git a/include/uapi/drm/drm_ras.h b/include/uapi/drm/drm_ras.h index 218a3ee86805..59530f987ba2 100644 --- a/include/uapi/drm/drm_ras.h +++ b/include/uapi/drm/drm_ras.h @@ -33,6 +33,7 @@ enum { DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_NAME, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_VALUE, + DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_THRESHOLD, __DRM_RAS_A_ERROR_COUNTER_ATTRS_MAX, DRM_RAS_A_ERROR_COUNTER_ATTRS_MAX = (__DRM_RAS_A_ERROR_COUNTER_ATTRS_MAX - 1) @@ -42,6 +43,7 @@ enum { DRM_RAS_CMD_LIST_NODES = 1, DRM_RAS_CMD_GET_ERROR_COUNTER, DRM_RAS_CMD_CLEAR_ERROR_COUNTER, + DRM_RAS_CMD_GET_ERROR_THRESHOLD, __DRM_RAS_CMD_MAX, DRM_RAS_CMD_MAX = (__DRM_RAS_CMD_MAX - 1) -- 2.43.0 Add set-error-threshold command support which allows setting threshold value of the error. 
In the RAS context, a threshold is the number of errors the hardware is expected to accumulate before it raises them to software. This provides fine-grained control over error notifications that are raised by the hardware. Signed-off-by: Raag Jadav --- v2: Return -EOPNOTSUPP on threshold callbacks absence (Riana) Document threshold value bounds checking responsibility (Riana) --- Documentation/gpu/drm-ras.rst | 9 +++++ Documentation/netlink/specs/drm_ras.yaml | 12 ++++++ drivers/gpu/drm/drm_ras.c | 51 ++++++++++++++++++++++++ drivers/gpu/drm/drm_ras_nl.c | 14 +++++++ drivers/gpu/drm/drm_ras_nl.h | 2 + include/drm/drm_ras.h | 14 +++++++ include/uapi/drm/drm_ras.h | 1 + 7 files changed, 103 insertions(+) diff --git a/Documentation/gpu/drm-ras.rst b/Documentation/gpu/drm-ras.rst index dfa72e8becda..07a33d16bad9 100644 --- a/Documentation/gpu/drm-ras.rst +++ b/Documentation/gpu/drm-ras.rst @@ -56,6 +56,8 @@ User space tools can: ``node-id`` and ``error-id`` as parameters. * Query specific error threshold value with the ``get-error-threshold`` command, using both ``node-id`` and ``error-id`` as parameters. +* Set specific error threshold value with the ``set-error-threshold`` command, using + ``node-id``, ``error-id`` and ``error-threshold`` as parameters. YAML-based Interface -------------------- @@ -118,3 +120,10 @@ Example: Query threshold value of a given error sudo ynl --family drm_ras --do get-error-threshold --json '{"node-id":0, "error-id":1}' {'error-id': 1, 'error-name': 'error_name1', 'error-threshold': 16} + +Example: Set threshold value of a given error + +.. 
code-block:: bash + + sudo ynl --family drm_ras --do set-error-threshold --json '{"node-id":0, "error-id":1, "error-threshold":8}' + None diff --git a/Documentation/netlink/specs/drm_ras.yaml b/Documentation/netlink/specs/drm_ras.yaml index 016d713069bb..ba7e0a944e7d 100644 --- a/Documentation/netlink/specs/drm_ras.yaml +++ b/Documentation/netlink/specs/drm_ras.yaml @@ -148,3 +148,15 @@ operations: - error-id - error-name - error-threshold + - + name: set-error-threshold + doc: >- + Set threshold value of an error. + attribute-set: error-counter-attrs + flags: [admin-perm] + do: + request: + attributes: + - node-id + - error-id + - error-threshold diff --git a/drivers/gpu/drm/drm_ras.c b/drivers/gpu/drm/drm_ras.c index 87e57bd1e8ad..f351147489ca 100644 --- a/drivers/gpu/drm/drm_ras.c +++ b/drivers/gpu/drm/drm_ras.c @@ -45,6 +45,9 @@ * Userspace must provide Node ID and Error ID. * Returns the threshold value of a specific error. * + * 5. SET_ERROR_THRESHOLD: Set threshold value of an error. + * Userspace must provide Node ID, Error ID and threshold value to be set. + * * Node registration: * * - drm_ras_node_register(): Registers a new node and assigns @@ -78,6 +81,8 @@ * operation, clearing a counter value from a specific node. * - drm_ras_nl_get_error_threshold_doit(): Implements the GET_ERROR_THRESHOLD doit * operation, fetching the threshold value of a specific error. + * - drm_ras_nl_set_error_threshold_doit(): Implements the SET_ERROR_THRESHOLD doit + * operation, setting the threshold value of a specific error. 
*/ static DEFINE_XARRAY_ALLOC(drm_ras_xa); @@ -193,6 +198,24 @@ static int get_node_error_threshold(u32 node_id, u32 error_id, return node->query_error_threshold(node, error_id, name, value); } +static int set_node_error_threshold(u32 node_id, u32 error_id, u32 value) +{ + struct drm_ras_node *node; + + node = xa_load(&drm_ras_xa, node_id); + if (!node) + return -ENOENT; + + if (!node->set_error_threshold) + return -EOPNOTSUPP; + + if (error_id < node->error_counter_range.first || + error_id > node->error_counter_range.last) + return -EINVAL; + + return node->set_error_threshold(node, error_id, value); +} + static int msg_reply_counter_value(struct sk_buff *msg, u32 error_id, const char *error_name, u32 value) { @@ -464,6 +487,34 @@ int drm_ras_nl_get_error_threshold_doit(struct sk_buff *skb, return doit_reply_threshold_value(info, node_id, error_id); } +/** + * drm_ras_nl_set_error_threshold_doit() - Set threshold value of an error + * @skb: Netlink message buffer + * @info: Generic Netlink info containing attributes of the request + * + * Extracts the Node ID, Error ID and threshold value from the netlink attributes + * and sets the threshold of the corresponding error. + * + * Return: 0 on success, or negative errno on failure. 
+ */ +int drm_ras_nl_set_error_threshold_doit(struct sk_buff *skb, + struct genl_info *info) +{ + u32 node_id, error_id, value; + + if (!info->attrs || + GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID) || + GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID) || + GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_THRESHOLD)) + return -EINVAL; + + node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]); + error_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID]); + value = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_THRESHOLD]); + + return set_node_error_threshold(node_id, error_id, value); +} + /** * drm_ras_node_register() - Register a new RAS node * @node: Node structure to register diff --git a/drivers/gpu/drm/drm_ras_nl.c b/drivers/gpu/drm/drm_ras_nl.c index ecec2041c758..02e8e5054d05 100644 --- a/drivers/gpu/drm/drm_ras_nl.c +++ b/drivers/gpu/drm/drm_ras_nl.c @@ -34,6 +34,13 @@ static const struct nla_policy drm_ras_get_error_threshold_nl_policy[DRM_RAS_A_E [DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID] = { .type = NLA_U32, }, }; +/* DRM_RAS_CMD_SET_ERROR_THRESHOLD - do */ +static const struct nla_policy drm_ras_set_error_threshold_nl_policy[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_THRESHOLD + 1] = { + [DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID] = { .type = NLA_U32, }, + [DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID] = { .type = NLA_U32, }, + [DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_THRESHOLD] = { .type = NLA_U32, }, +}; + /* Ops table for drm_ras */ static const struct genl_split_ops drm_ras_nl_ops[] = { { @@ -69,6 +76,13 @@ static const struct genl_split_ops drm_ras_nl_ops[] = { .maxattr = DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = DRM_RAS_CMD_SET_ERROR_THRESHOLD, + .doit = drm_ras_nl_set_error_threshold_doit, + .policy = drm_ras_set_error_threshold_nl_policy, + .maxattr = DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_THRESHOLD, + .flags = 
GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, }; struct genl_family drm_ras_nl_family __ro_after_init = { diff --git a/drivers/gpu/drm/drm_ras_nl.h b/drivers/gpu/drm/drm_ras_nl.h index 399280c2c6e1..57b1e647d833 100644 --- a/drivers/gpu/drm/drm_ras_nl.h +++ b/drivers/gpu/drm/drm_ras_nl.h @@ -22,6 +22,8 @@ int drm_ras_nl_clear_error_counter_doit(struct sk_buff *skb, struct genl_info *info); int drm_ras_nl_get_error_threshold_doit(struct sk_buff *skb, struct genl_info *info); +int drm_ras_nl_set_error_threshold_doit(struct sk_buff *skb, + struct genl_info *info); extern struct genl_family drm_ras_nl_family; diff --git a/include/drm/drm_ras.h b/include/drm/drm_ras.h index 7bb429d85f57..134e20a16abc 100644 --- a/include/drm/drm_ras.h +++ b/include/drm/drm_ras.h @@ -83,6 +83,20 @@ struct drm_ras_node { */ int (*query_error_threshold)(struct drm_ras_node *node, u32 error_id, const char **name, u32 *val); + /** + * @set_error_threshold: + * + * This callback is used by drm-ras to set threshold value of a specific + * error. + * + * Driver should expect set_error_threshold() to be called with error_id + * from `error_counter_range.first` to `error_counter_range.last`. + * Driver is responsible for threshold value bounds checking. + * + * Returns: 0 on success, negative error code on failure. + */ + int (*set_error_threshold)(struct drm_ras_node *node, u32 error_id, + u32 val); /** @priv: Driver private data */ void *priv; diff --git a/include/uapi/drm/drm_ras.h b/include/uapi/drm/drm_ras.h index 59530f987ba2..27c68956495f 100644 --- a/include/uapi/drm/drm_ras.h +++ b/include/uapi/drm/drm_ras.h @@ -44,6 +44,7 @@ enum { DRM_RAS_CMD_GET_ERROR_COUNTER, DRM_RAS_CMD_CLEAR_ERROR_COUNTER, DRM_RAS_CMD_GET_ERROR_THRESHOLD, + DRM_RAS_CMD_SET_ERROR_THRESHOLD, __DRM_RAS_CMD_MAX, DRM_RAS_CMD_MAX = (__DRM_RAS_CMD_MAX - 1) -- 2.43.0 From: Riana Tauro Add additional Error components supported by XE drm_ras (Reliability, Availability and Serviceability). 
Signed-off-by: Riana Tauro Reviewed-by: Aravind Iddamsetty Reviewed-by: Mallesh Koujalagi --- include/uapi/drm/xe_drm.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index 48e9f1fdb78d..50c80af4ad4e 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -2589,6 +2589,12 @@ enum drm_xe_ras_error_component { DRM_XE_RAS_ERR_COMP_CORE_COMPUTE = 1, /** @DRM_XE_RAS_ERR_COMP_SOC_INTERNAL: SoC Internal Error */ DRM_XE_RAS_ERR_COMP_SOC_INTERNAL, + /** @DRM_XE_RAS_ERR_COMP_DEVICE_MEMORY: Device Memory Error */ + DRM_XE_RAS_ERR_COMP_DEVICE_MEMORY, + /** @DRM_XE_RAS_ERR_COMP_PCIE: PCIe Subsystem Error */ + DRM_XE_RAS_ERR_COMP_PCIE, + /** @DRM_XE_RAS_ERR_COMP_FABRIC: Fabric Subsystem Error */ + DRM_XE_RAS_ERR_COMP_FABRIC, /** @DRM_XE_RAS_ERR_COMP_MAX: Max Error */ DRM_XE_RAS_ERR_COMP_MAX /* non-ABI */ }; @@ -2606,7 +2612,10 @@ enum drm_xe_ras_error_component { */ #define DRM_XE_RAS_ERROR_COMPONENT_NAMES { \ [DRM_XE_RAS_ERR_COMP_CORE_COMPUTE] = "core-compute", \ - [DRM_XE_RAS_ERR_COMP_SOC_INTERNAL] = "soc-internal" \ + [DRM_XE_RAS_ERR_COMP_SOC_INTERNAL] = "soc-internal", \ + [DRM_XE_RAS_ERR_COMP_DEVICE_MEMORY] = "device-memory", \ + [DRM_XE_RAS_ERR_COMP_PCIE] = "pcie", \ + [DRM_XE_RAS_ERR_COMP_FABRIC] = "fabric", \ } #if defined(__cplusplus) -- 2.43.0 The system controller allows programming a per-error threshold value, which it uses to raise error events to the driver. Get it using a mailbox command so that it can be exposed to the user. 
Signed-off-by: Raag Jadav --- drivers/gpu/drm/xe/xe_ras.c | 58 +++++++++++++++++++ drivers/gpu/drm/xe/xe_ras.h | 3 + drivers/gpu/drm/xe/xe_ras_types.h | 22 +++++++ drivers/gpu/drm/xe/xe_sysctrl_mailbox.c | 29 ++++++++++ drivers/gpu/drm/xe/xe_sysctrl_mailbox.h | 3 + drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h | 2 + 6 files changed, 117 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c index 4cb16b419b0c..434dea8bbdb1 100644 --- a/drivers/gpu/drm/xe/xe_ras.c +++ b/drivers/gpu/drm/xe/xe_ras.c @@ -4,11 +4,14 @@ */ #include "xe_device.h" +#include "xe_pm.h" #include "xe_printk.h" #include "xe_ras.h" #include "xe_ras_types.h" #include "xe_sysctrl.h" #include "xe_sysctrl_event_types.h" +#include "xe_sysctrl_mailbox.h" +#include "xe_sysctrl_mailbox_types.h" /* Severity of detected errors */ enum xe_ras_severity { @@ -50,6 +53,23 @@ static const char *const xe_ras_components[] = { }; static_assert(ARRAY_SIZE(xe_ras_components) == XE_RAS_COMP_MAX); +/* uAPI mapping */ +static const int drm_to_xe_ras_components[] = { + [DRM_XE_RAS_ERR_COMP_CORE_COMPUTE] = XE_RAS_COMP_CORE_COMPUTE, + [DRM_XE_RAS_ERR_COMP_SOC_INTERNAL] = XE_RAS_COMP_SOC_INTERNAL, + [DRM_XE_RAS_ERR_COMP_DEVICE_MEMORY] = XE_RAS_COMP_DEVICE_MEMORY, + [DRM_XE_RAS_ERR_COMP_PCIE] = XE_RAS_COMP_PCIE, + [DRM_XE_RAS_ERR_COMP_FABRIC] = XE_RAS_COMP_FABRIC, +}; +static_assert(ARRAY_SIZE(drm_to_xe_ras_components) == DRM_XE_RAS_ERR_COMP_MAX); + +/* uAPI mapping */ +static const int drm_to_xe_ras_severities[] = { + [DRM_XE_RAS_ERR_SEV_CORRECTABLE] = XE_RAS_SEV_CORRECTABLE, + [DRM_XE_RAS_ERR_SEV_UNCORRECTABLE] = XE_RAS_SEV_UNCORRECTABLE, +}; +static_assert(ARRAY_SIZE(drm_to_xe_ras_severities) == DRM_XE_RAS_ERR_SEV_MAX); + static inline const char *sev_to_str(u8 severity) { if (severity >= XE_RAS_SEV_MAX) @@ -91,3 +111,41 @@ void xe_ras_counter_threshold_crossed(struct xe_device *xe, comp_to_str(component), sev_to_str(severity)); } } + +int xe_ras_get_threshold(struct xe_device *xe, u32 
severity, u32 component, u32 *threshold) +{ + struct xe_ras_get_threshold_response response = {}; + struct xe_ras_get_threshold_request request = {}; + struct xe_sysctrl_mailbox_command command = {}; + struct xe_ras_error_class counter = {}; + size_t len; + int ret; + + counter.common.severity = drm_to_xe_ras_severities[severity]; + counter.common.component = drm_to_xe_ras_components[component]; + request.counter = counter; + + xe_sysctrl_populate_command(&command, &request, &response, sizeof(request), + sizeof(response), XE_SYSCTRL_GROUP_GFSP, + XE_SYSCTRL_CMD_GET_THRESHOLD); + + guard(xe_pm_runtime)(xe); + ret = xe_sysctrl_send_command(&xe->sc, &command, &len); + if (ret) { + xe_err(xe, "sysctrl: failed to get threshold %d\n", ret); + return ret; + } + + if (len != sizeof(response)) { + xe_err(xe, "sysctrl: unexpected get threshold response length %zu (expected %zu)\n", + len, sizeof(response)); + return -EIO; + } + + counter = response.counter; + *threshold = response.threshold; + + xe_dbg(xe, "[RAS]: get threshold %u for %s %s\n", response.threshold, + comp_to_str(counter.common.component), sev_to_str(counter.common.severity)); + return 0; +} diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h index ea90593b62dc..982bbe61461e 100644 --- a/drivers/gpu/drm/xe/xe_ras.h +++ b/drivers/gpu/drm/xe/xe_ras.h @@ -6,10 +6,13 @@ #ifndef _XE_RAS_H_ #define _XE_RAS_H_ +#include + struct xe_device; struct xe_sysctrl_event_response; void xe_ras_counter_threshold_crossed(struct xe_device *xe, struct xe_sysctrl_event_response *response); +int xe_ras_get_threshold(struct xe_device *xe, u32 severity, u32 component, u32 *threshold); #endif diff --git a/drivers/gpu/drm/xe/xe_ras_types.h b/drivers/gpu/drm/xe/xe_ras_types.h index 4e63c67f806a..c29e9a3d43ce 100644 --- a/drivers/gpu/drm/xe/xe_ras_types.h +++ b/drivers/gpu/drm/xe/xe_ras_types.h @@ -70,4 +70,26 @@ struct xe_ras_threshold_crossed { struct xe_ras_error_class counters[XE_RAS_NUM_COUNTERS]; } __packed; +/** 
+ * struct xe_ras_get_threshold_request - Request structure for get threshold + */ +struct xe_ras_get_threshold_request { + /** @counter: Counter to get threshold for */ + struct xe_ras_error_class counter; + /** @reserved: Reserved for future use */ + u32 reserved; +} __packed; + +/** + * struct xe_ras_get_threshold_response - Response structure for get threshold + */ +struct xe_ras_get_threshold_response { + /** @counter: Counter ID */ + struct xe_ras_error_class counter; + /** @threshold: Threshold value */ + u32 threshold; + /** @reserved: Reserved for future use */ + u32 reserved[4]; +} __packed; + #endif diff --git a/drivers/gpu/drm/xe/xe_sysctrl_mailbox.c b/drivers/gpu/drm/xe/xe_sysctrl_mailbox.c index 3caa9f15875f..dc4cadd50ee8 100644 --- a/drivers/gpu/drm/xe/xe_sysctrl_mailbox.c +++ b/drivers/gpu/drm/xe/xe_sysctrl_mailbox.c @@ -307,6 +307,35 @@ void xe_sysctrl_mailbox_init(struct xe_sysctrl *sc) sc->phase_bit = (ctrl_reg & SYSCTRL_FRAME_PHASE) ? 1 : 0; } +/** + * xe_sysctrl_populate_command() - Populate System Controller command structure + * @command: System Controller command structure + * @request: Pointer to request structure + * @response: Pointer to response structure + * @request_len: Length of request structure + * @response_len: Length of response structure + * @group_id: Group ID to be used with command + * @cmd_id: Command ID to be used with command + * + * Helper for mailbox users to populate command structure fields to be later + * sent to xe_sysctrl_send_command(). 
+ */ +void xe_sysctrl_populate_command(struct xe_sysctrl_mailbox_command *command, void *request, + void *response, size_t request_len, size_t response_len, + u8 group_id, u8 cmd_id) +{ + struct xe_sysctrl_app_msg_hdr header = {}; + + header.data = FIELD_PREP(APP_HDR_GROUP_ID_MASK, group_id) | + FIELD_PREP(APP_HDR_COMMAND_MASK, cmd_id); + + command->header = header; + command->data_in = request; + command->data_in_len = request_len; + command->data_out = response; + command->data_out_len = response_len; +} + /** * xe_sysctrl_send_command() - Send mailbox command to System Controller * @sc: System Controller instance diff --git a/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h b/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h index f67e9234de48..5a4a0fed304f 100644 --- a/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h +++ b/drivers/gpu/drm/xe/xe_sysctrl_mailbox.h @@ -24,6 +24,9 @@ struct xe_sysctrl_mailbox_command; FIELD_GET(APP_HDR_VERSION_MASK, (hdr)->data) void xe_sysctrl_mailbox_init(struct xe_sysctrl *sc); +void xe_sysctrl_populate_command(struct xe_sysctrl_mailbox_command *command, void *request, + void *response, size_t request_len, size_t response_len, + u8 group_id, u8 cmd_id); int xe_sysctrl_send_command(struct xe_sysctrl *sc, struct xe_sysctrl_mailbox_command *cmd, size_t *rdata_len); diff --git a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h index 84d7c647e743..a1b71218deca 100644 --- a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h +++ b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h @@ -22,9 +22,11 @@ enum xe_sysctrl_group { /** * enum xe_sysctrl_gfsp_cmd - Commands supported by GFSP group * + * @XE_SYSCTRL_CMD_GET_THRESHOLD: Retrieve error threshold * @XE_SYSCTRL_CMD_GET_PENDING_EVENT: Retrieve pending event */ enum xe_sysctrl_gfsp_cmd { + XE_SYSCTRL_CMD_GET_THRESHOLD = 0x05, XE_SYSCTRL_CMD_GET_PENDING_EVENT = 0x07, }; -- 2.43.0 System controller allows programming per error threshold value, which it uses to raise error events to 
the driver. Set it using mailbox command so that it can be programmed by the user. Signed-off-by: Raag Jadav --- v2: Add RAS operation status codes (Riana) --- drivers/gpu/drm/xe/xe_ras.c | 72 +++++++++++++++++++ drivers/gpu/drm/xe/xe_ras.h | 1 + drivers/gpu/drm/xe/xe_ras_types.h | 28 ++++++++ drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h | 2 + 4 files changed, 103 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c index 434dea8bbdb1..4548e5cb08b9 100644 --- a/drivers/gpu/drm/xe/xe_ras.c +++ b/drivers/gpu/drm/xe/xe_ras.c @@ -34,6 +34,17 @@ enum xe_ras_component { XE_RAS_COMP_MAX }; +/* RAS operation status codes */ +enum xe_ras_status { + XE_RAS_STATUS_SUCCESS = 0, + XE_RAS_STATUS_INVALID_PARAM, + XE_RAS_STATUS_NOT_SUPPORTED, + XE_RAS_STATUS_TIMEOUT, + XE_RAS_STATUS_HARDWARE_FAILURE, + XE_RAS_STATUS_INSUFFICIENT_RESOURCES, + XE_RAS_STATUS_MAX +}; + static const char *const xe_ras_severities[] = { [XE_RAS_SEV_NOT_SUPPORTED] = "Not Supported", [XE_RAS_SEV_CORRECTABLE] = "Correctable Error", @@ -70,6 +81,24 @@ static const int drm_to_xe_ras_severities[] = { }; static_assert(ARRAY_SIZE(drm_to_xe_ras_severities) == DRM_XE_RAS_ERR_SEV_MAX); +static int ras_status_to_errno(u32 status) +{ + switch (status) { + case XE_RAS_STATUS_INVALID_PARAM: + return -EINVAL; + case XE_RAS_STATUS_NOT_SUPPORTED: + return -EOPNOTSUPP; + case XE_RAS_STATUS_TIMEOUT: + return -ETIMEDOUT; + case XE_RAS_STATUS_HARDWARE_FAILURE: + return -EIO; + case XE_RAS_STATUS_INSUFFICIENT_RESOURCES: + return -ENOSPC; + default: + return -EPROTO; + } +}; + static inline const char *sev_to_str(u8 severity) { if (severity >= XE_RAS_SEV_MAX) @@ -149,3 +178,46 @@ int xe_ras_get_threshold(struct xe_device *xe, u32 severity, u32 component, u32 comp_to_str(counter.common.component), sev_to_str(counter.common.severity)); return 0; } + +int xe_ras_set_threshold(struct xe_device *xe, u32 severity, u32 component, u32 threshold) +{ + struct xe_ras_set_threshold_response response = {}; + 
struct xe_ras_set_threshold_request request = {}; + struct xe_sysctrl_mailbox_command command = {}; + struct xe_ras_error_class counter = {}; + size_t len; + int ret; + + counter.common.severity = drm_to_xe_ras_severities[severity]; + counter.common.component = drm_to_xe_ras_components[component]; + request.counter = counter; + request.threshold = threshold; + + xe_sysctrl_populate_command(&command, &request, &response, sizeof(request), + sizeof(response), XE_SYSCTRL_GROUP_GFSP, + XE_SYSCTRL_CMD_SET_THRESHOLD); + + guard(xe_pm_runtime)(xe); + ret = xe_sysctrl_send_command(&xe->sc, &command, &len); + if (ret) { + xe_err(xe, "sysctrl: failed to set threshold %d\n", ret); + return ret; + } + + if (len != sizeof(response)) { + xe_err(xe, "sysctrl: unexpected set threshold response length %zu (expected %zu)\n", + len, sizeof(response)); + return -EIO; + } + + if (response.status) { + xe_err(xe, "sysctrl: set threshold operation failed %#x\n", response.status); + return ras_status_to_errno(response.status); + } + + counter = response.counter; + + xe_dbg(xe, "[RAS]: set threshold %u for %s %s\n", response.threshold, + comp_to_str(counter.common.component), sev_to_str(counter.common.severity)); + return 0; +} diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h index 982bbe61461e..d1f71b1de723 100644 --- a/drivers/gpu/drm/xe/xe_ras.h +++ b/drivers/gpu/drm/xe/xe_ras.h @@ -14,5 +14,6 @@ struct xe_sysctrl_event_response; void xe_ras_counter_threshold_crossed(struct xe_device *xe, struct xe_sysctrl_event_response *response); int xe_ras_get_threshold(struct xe_device *xe, u32 severity, u32 component, u32 *threshold); +int xe_ras_set_threshold(struct xe_device *xe, u32 severity, u32 component, u32 threshold); #endif diff --git a/drivers/gpu/drm/xe/xe_ras_types.h b/drivers/gpu/drm/xe/xe_ras_types.h index c29e9a3d43ce..6047fd891022 100644 --- a/drivers/gpu/drm/xe/xe_ras_types.h +++ b/drivers/gpu/drm/xe/xe_ras_types.h @@ -92,4 +92,32 @@ struct 
xe_ras_get_threshold_response { u32 reserved[4]; } __packed; +/** + * struct xe_ras_set_threshold_request - Request structure for set threshold + */ +struct xe_ras_set_threshold_request { + /** @counter: Counter to set threshold for */ + struct xe_ras_error_class counter; + /** @threshold: Threshold value to set */ + u32 threshold; + /** @reserved: Reserved for future use */ + u32 reserved; +} __packed; + +/** + * struct xe_ras_set_threshold_response - Response structure for set threshold + */ +struct xe_ras_set_threshold_response { + /** @counter: Counter ID */ + struct xe_ras_error_class counter; + /** @threshold_prev: Previous threshold value */ + u32 threshold_prev; + /** @threshold: Updated threshold value */ + u32 threshold; + /** @status: Set threshold operation status */ + u32 status; + /** @reserved: Reserved for future use */ + u32 reserved[2]; +} __packed; + #endif diff --git a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h index a1b71218deca..b865768e903b 100644 --- a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h +++ b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h @@ -23,10 +23,12 @@ enum xe_sysctrl_group { * enum xe_sysctrl_gfsp_cmd - Commands supported by GFSP group * * @XE_SYSCTRL_CMD_GET_THRESHOLD: Retrieve error threshold + * @XE_SYSCTRL_CMD_SET_THRESHOLD: Set error threshold * @XE_SYSCTRL_CMD_GET_PENDING_EVENT: Retrieve pending event */ enum xe_sysctrl_gfsp_cmd { XE_SYSCTRL_CMD_GET_THRESHOLD = 0x05, + XE_SYSCTRL_CMD_SET_THRESHOLD = 0x06, XE_SYSCTRL_CMD_GET_PENDING_EVENT = 0x07, }; -- 2.43.0 Now that we have get/set error threshold support in xe driver, wire them up to drm_ras so that userspace can make use of the functionality. 
$ sudo ynl --family drm_ras --do get-error-threshold \ --json '{"node-id":0, "error-id":2}' {'error-id': 2, 'error-name': 'soc-internal', 'error-threshold': 16} $ sudo ynl --family drm_ras --do set-error-threshold \ --json '{"node-id":0, "error-id":2, "error-threshold":8}' None Signed-off-by: Raag Jadav Reviewed-by: Riana Tauro --- drivers/gpu/drm/xe/xe_drm_ras.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c index c21c8b428de6..31780d8af7e9 100644 --- a/drivers/gpu/drm/xe/xe_drm_ras.c +++ b/drivers/gpu/drm/xe/xe_drm_ras.c @@ -11,6 +11,7 @@ #include "xe_device_types.h" #include "xe_drm_ras.h" +#include "xe_ras.h" static const char * const error_components[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES; static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES; @@ -75,6 +76,30 @@ static int clear_correctable_error_counter(struct drm_ras_node *node, u32 error_ return hw_clear_error_counter(info, error_id); } +static int query_correctable_error_threshold(struct drm_ras_node *ep, u32 error_id, + const char **name, u32 *val) +{ + struct xe_device *xe = ep->priv; + struct xe_drm_ras *ras = &xe->ras; + struct xe_drm_ras_counter *info = ras->info[DRM_XE_RAS_ERR_SEV_CORRECTABLE]; + + if (!xe->info.has_sysctrl) + return -EOPNOTSUPP; + + *name = info[error_id].name; + return xe_ras_get_threshold(xe, DRM_XE_RAS_ERR_SEV_CORRECTABLE, error_id, val); +} + +static int set_correctable_error_threshold(struct drm_ras_node *ep, u32 error_id, u32 val) +{ + struct xe_device *xe = ep->priv; + + if (!xe->info.has_sysctrl) + return -EOPNOTSUPP; + + return xe_ras_set_threshold(xe, DRM_XE_RAS_ERR_SEV_CORRECTABLE, error_id, val); +} + static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe) { struct xe_drm_ras_counter *counter; @@ -123,6 +148,8 @@ static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node, if (severity == 
DRM_XE_RAS_ERR_SEV_CORRECTABLE) { node->query_error_counter = query_correctable_error_counter; node->clear_error_counter = clear_correctable_error_counter; + node->query_error_threshold = query_correctable_error_threshold; + node->set_error_threshold = set_correctable_error_threshold; } else { node->query_error_counter = query_uncorrectable_error_counter; node->clear_error_counter = clear_uncorrectable_error_counter; -- 2.43.0 From: Riana Tauro Move xe drm_ras registration to RAS initialization flow and keep hardware error initialization for processing errors reported via irq. Also reorder soc remapper and system controller initialization to early probe as ras init is dependent on both. Cc: Anoop Vijay Cc: Umesh Nerlige Ramappa Signed-off-by: Riana Tauro --- drivers/gpu/drm/xe/xe_device.c | 19 +++++++++++-------- drivers/gpu/drm/xe/xe_hw_error.c | 13 ------------- drivers/gpu/drm/xe/xe_ras.c | 20 ++++++++++++++++++++ drivers/gpu/drm/xe/xe_ras.h | 1 + 4 files changed, 32 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 65f107ba1410..402504971e3d 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -63,6 +63,7 @@ #include "xe_psmi.h" #include "xe_pxp.h" #include "xe_query.h" +#include "xe_ras.h" #include "xe_shrinker.h" #include "xe_soc_remapper.h" #include "xe_survivability_mode.h" @@ -964,6 +965,16 @@ int xe_device_probe(struct xe_device *xe) if (err) return err; + err = xe_soc_remapper_init(xe); + if (err) + return err; + + err = xe_sysctrl_init(xe); + if (err) + return err; + + xe_ras_init(xe); + /* * Now that GT is initialized (TTM in particular), * we can try to init display, and inherit the initial fb.
@@ -1004,10 +1015,6 @@ int xe_device_probe(struct xe_device *xe) xe_nvm_init(xe); - err = xe_soc_remapper_init(xe); - if (err) - return err; - err = xe_heci_gsc_init(xe); if (err) return err; @@ -1046,10 +1053,6 @@ int xe_device_probe(struct xe_device *xe) if (err) goto err_unregister_display; - err = xe_sysctrl_init(xe); - if (err) - goto err_unregister_display; - err = xe_device_sysfs_init(xe); if (err) goto err_unregister_display; diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c index 5135e8e4093f..e0ae6fee2c0e 100644 --- a/drivers/gpu/drm/xe/xe_hw_error.c +++ b/drivers/gpu/drm/xe/xe_hw_error.c @@ -516,14 +516,6 @@ void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl) } } -static int hw_error_info_init(struct xe_device *xe) -{ - if (xe->info.platform != XE_PVC) - return 0; - - return xe_drm_ras_init(xe); -} - /* * Process hardware errors during boot */ @@ -550,16 +542,11 @@ static void process_hw_errors(struct xe_device *xe) void xe_hw_error_init(struct xe_device *xe) { struct xe_tile *tile = xe_device_get_root_tile(xe); - int ret; if (!IS_DGFX(xe) || IS_SRIOV_VF(xe)) return; INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work); - ret = hw_error_info_init(xe); - if (ret) - drm_err(&xe->drm, "Failed to initialize XE DRM RAS (%pe)\n", ERR_PTR(ret)); - process_hw_errors(xe); } diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c index 4548e5cb08b9..57ee0ed0d46c 100644 --- a/drivers/gpu/drm/xe/xe_ras.c +++ b/drivers/gpu/drm/xe/xe_ras.c @@ -4,6 +4,7 @@ */ #include "xe_device.h" +#include "xe_drm_ras.h" #include "xe_pm.h" #include "xe_printk.h" #include "xe_ras.h" @@ -221,3 +222,22 @@ int xe_ras_set_threshold(struct xe_device *xe, u32 severity, u32 component, u32 comp_to_str(counter.common.component), sev_to_str(counter.common.severity)); return 0; } + +/** + * xe_ras_init - Initialize Xe RAS + * @xe: xe device instance + * + * Initialize Xe RAS + */ +void xe_ras_init(struct xe_device *xe) +{ + int 
ret; + + if (xe->info.platform != XE_PVC) + return; + + ret = xe_drm_ras_init(xe); + if (ret) + drm_err(&xe->drm, "Failed to initialize xe_drm_ras %d\n", ret); +} + diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h index d1f71b1de723..b6bc50863fa6 100644 --- a/drivers/gpu/drm/xe/xe_ras.h +++ b/drivers/gpu/drm/xe/xe_ras.h @@ -11,6 +11,7 @@ struct xe_device; struct xe_sysctrl_event_response; +void xe_ras_init(struct xe_device *xe); void xe_ras_counter_threshold_crossed(struct xe_device *xe, struct xe_sysctrl_event_response *response); int xe_ras_get_threshold(struct xe_device *xe, u32 severity, u32 component, u32 *threshold); -- 2.43.0 From: Riana Tauro Add a flag to control xe drm_ras registration. Enable this flag for PVC and CRI to support exposing RAS error counters via netlink. Signed-off-by: Riana Tauro Reviewed-by: Raag Jadav info.is_dgfx = desc->is_dgfx; xe->info.has_cached_pt = desc->has_cached_pt; + xe->info.has_drm_ras = desc->has_drm_ras; xe->info.has_fan_control = desc->has_fan_control; /* runtime fusing may force flat_ccs to disabled later */ xe->info.has_flat_ccs = desc->has_flat_ccs; diff --git a/drivers/gpu/drm/xe/xe_pci_types.h b/drivers/gpu/drm/xe/xe_pci_types.h index 5b85e2c24b7b..24d4a3d00517 100644 --- a/drivers/gpu/drm/xe/xe_pci_types.h +++ b/drivers/gpu/drm/xe/xe_pci_types.h @@ -40,6 +40,7 @@ struct xe_device_desc { u8 has_cached_pt:1; u8 has_display:1; + u8 has_drm_ras:1; u8 has_fan_control:1; u8 has_flat_ccs:1; u8 has_gsc_nvm:1; diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c index 57ee0ed0d46c..7464057839ec 100644 --- a/drivers/gpu/drm/xe/xe_ras.c +++ b/drivers/gpu/drm/xe/xe_ras.c @@ -233,7 +233,7 @@ void xe_ras_init(struct xe_device *xe) { int ret; - if (xe->info.platform != XE_PVC) + if (!xe->info.has_drm_ras) return; ret = xe_drm_ras_init(xe); -- 2.43.0