Add get-error-threshold command support, which allows querying the threshold value of an error. In the RAS context, a threshold is the number of errors the hardware is expected to accumulate before it raises them to software. This provides fine-grained control over the error notifications raised by the hardware. Signed-off-by: Raag Jadav --- v2: Document threshold definition (Riana) Return -EOPNOTSUPP on threshold callbacks absence (Riana) Cancel and free genlmsg on failure (Riana) --- Documentation/gpu/drm-ras.rst | 9 ++ Documentation/netlink/specs/drm_ras.yaml | 28 +++++- drivers/gpu/drm/drm_ras.c | 109 +++++++++++++++++++++++ drivers/gpu/drm/drm_ras_nl.c | 13 +++ drivers/gpu/drm/drm_ras_nl.h | 2 + include/drm/drm_ras.h | 15 ++++ include/uapi/drm/drm_ras.h | 2 + 7 files changed, 176 insertions(+), 2 deletions(-) diff --git a/Documentation/gpu/drm-ras.rst b/Documentation/gpu/drm-ras.rst index 4636e68f5678..dfa72e8becda 100644 --- a/Documentation/gpu/drm-ras.rst +++ b/Documentation/gpu/drm-ras.rst @@ -54,6 +54,8 @@ User space tools can: ``node-id`` and ``error-id`` as parameters. * Clear specific error counters with the ``clear-error-counter`` command, using both ``node-id`` and ``error-id`` as parameters. +* Query specific error threshold value with the ``get-error-threshold`` command, using both + ``node-id`` and ``error-id`` as parameters. YAML-based Interface -------------------- @@ -109,3 +111,10 @@ Example: Clear an error counter for a given node sudo ynl --family drm_ras --do clear-error-counter --json '{"node-id":0, "error-id":1}' None + +Example: Query threshold value of a given error + +.. 
code-block:: bash + + sudo ynl --family drm_ras --do get-error-threshold --json '{"node-id":0, "error-id":1}' + {'error-id': 1, 'error-name': 'error_name1', 'error-threshold': 16} diff --git a/Documentation/netlink/specs/drm_ras.yaml b/Documentation/netlink/specs/drm_ras.yaml index e113056f8c01..016d713069bb 100644 --- a/Documentation/netlink/specs/drm_ras.yaml +++ b/Documentation/netlink/specs/drm_ras.yaml @@ -8,8 +8,10 @@ doc: >- DRM RAS (Reliability, Availability, Serviceability) over Generic Netlink. Provides a standardized mechanism for DRM drivers to register "nodes" representing hardware/software components capable of reporting error counters. - Userspace tools can query the list of nodes or individual error counters - via the Generic Netlink interface. + Userspace tools can query the list of nodes or individual error counters or + their thresholds via the Generic Netlink interface. In the RAS context, a + threshold is the number of errors the hardware is expected to accumulate + before it raises them to software. definitions: - @@ -69,6 +71,10 @@ attribute-sets: name: error-value type: u32 doc: Current value of the requested error counter. + - + name: error-threshold + type: u32 + doc: Threshold value of the error counter. operations: list: @@ -124,3 +130,21 @@ operations: do: request: attributes: *id-attrs + - + name: get-error-threshold + doc: >- + Retrieve the threshold value of an error. + The response includes the id, the name, and the current threshold + value of the error. + attribute-set: error-counter-attrs + flags: [admin-perm] + do: + request: + attributes: + - node-id + - error-id + reply: + attributes: + - error-id + - error-name + - error-threshold diff --git a/drivers/gpu/drm/drm_ras.c b/drivers/gpu/drm/drm_ras.c index 03db53d03329..87e57bd1e8ad 100644 --- a/drivers/gpu/drm/drm_ras.c +++ b/drivers/gpu/drm/drm_ras.c @@ -41,6 +41,10 @@ * Userspace must provide Node ID, Error ID. * Clears specific error counter of a node if supported. * + * 4. 
GET_ERROR_THRESHOLD: Query threshold value of an error. + * Userspace must provide Node ID and Error ID. + * Returns the threshold value of a specific error. + * * Node registration: * * - drm_ras_node_register(): Registers a new node and assigns @@ -72,6 +76,8 @@ * operation, fetching a counter value from a specific node. * - drm_ras_nl_clear_error_counter_doit(): Implements the CLEAR_ERROR_COUNTER doit * operation, clearing a counter value from a specific node. + * - drm_ras_nl_get_error_threshold_doit(): Implements the GET_ERROR_THRESHOLD doit + * operation, fetching the threshold value of a specific error. */ static DEFINE_XARRAY_ALLOC(drm_ras_xa); @@ -168,6 +174,25 @@ static int get_node_error_counter(u32 node_id, u32 error_id, return node->query_error_counter(node, error_id, name, value); } +static int get_node_error_threshold(u32 node_id, u32 error_id, + const char **name, u32 *value) +{ + struct drm_ras_node *node; + + node = xa_load(&drm_ras_xa, node_id); + if (!node) + return -ENOENT; + + if (!node->query_error_threshold) + return -EOPNOTSUPP; + + if (error_id < node->error_counter_range.first || + error_id > node->error_counter_range.last) + return -EINVAL; + + return node->query_error_threshold(node, error_id, name, value); +} + static int msg_reply_counter_value(struct sk_buff *msg, u32 error_id, const char *error_name, u32 value) { @@ -186,6 +211,24 @@ static int msg_reply_counter_value(struct sk_buff *msg, u32 error_id, value); } +static int msg_reply_threshold_value(struct sk_buff *msg, u32 error_id, + const char *error_name, u32 value) +{ + int ret; + + ret = nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, error_id); + if (ret) + return ret; + + ret = nla_put_string(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_NAME, + error_name); + if (ret) + return ret; + + return nla_put_u32(msg, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_THRESHOLD, + value); +} + static int doit_reply_counter_value(struct genl_info *info, u32 node_id, u32 error_id) { @@ -222,6 
+265,45 @@ static int doit_reply_counter_value(struct genl_info *info, u32 node_id, return genlmsg_reply(msg, info); } +static int doit_reply_threshold_value(struct genl_info *info, u32 node_id, + u32 error_id) +{ + struct sk_buff *msg; + struct nlattr *hdr; + const char *error_name; + u32 value; + int ret; + + msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_iput(msg, info); + if (!hdr) { + nlmsg_free(msg); + return -EMSGSIZE; + } + + ret = get_node_error_threshold(node_id, error_id, + &error_name, &value); + if (ret) { + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); + return ret; + } + + ret = msg_reply_threshold_value(msg, error_id, error_name, value); + if (ret) { + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); + return ret; + } + + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); +} + /** * drm_ras_nl_get_error_counter_dumpit() - Dump all Error Counters * @skb: Netlink message buffer @@ -355,6 +437,33 @@ int drm_ras_nl_clear_error_counter_doit(struct sk_buff *skb, return node->clear_error_counter(node, error_id); } +/** + * drm_ras_nl_get_error_threshold_doit() - Query threshold value of an error + * @skb: Netlink message buffer + * @info: Generic Netlink info containing attributes of the request + * + * Extracts the Node ID and Error ID from the netlink attributes and + * retrieves the threshold value of the corresponding error. Sends the + * result back to the requesting user via the standard Genl reply. + * + * Return: 0 on success, or negative errno on failure. 
+ */ +int drm_ras_nl_get_error_threshold_doit(struct sk_buff *skb, + struct genl_info *info) +{ + u32 node_id, error_id; + + if (!info->attrs || + GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID) || + GENL_REQ_ATTR_CHECK(info, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID)) + return -EINVAL; + + node_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID]); + error_id = nla_get_u32(info->attrs[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID]); + + return doit_reply_threshold_value(info, node_id, error_id); +} + /** * drm_ras_node_register() - Register a new RAS node * @node: Node structure to register diff --git a/drivers/gpu/drm/drm_ras_nl.c b/drivers/gpu/drm/drm_ras_nl.c index dea1c1b2494e..ecec2041c758 100644 --- a/drivers/gpu/drm/drm_ras_nl.c +++ b/drivers/gpu/drm/drm_ras_nl.c @@ -28,6 +28,12 @@ static const struct nla_policy drm_ras_clear_error_counter_nl_policy[DRM_RAS_A_E [DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID] = { .type = NLA_U32, }, }; +/* DRM_RAS_CMD_GET_ERROR_THRESHOLD - do */ +static const struct nla_policy drm_ras_get_error_threshold_nl_policy[DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID + 1] = { + [DRM_RAS_A_ERROR_COUNTER_ATTRS_NODE_ID] = { .type = NLA_U32, }, + [DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID] = { .type = NLA_U32, }, +}; + /* Ops table for drm_ras */ static const struct genl_split_ops drm_ras_nl_ops[] = { { @@ -56,6 +62,13 @@ static const struct genl_split_ops drm_ras_nl_ops[] = { .maxattr = DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = DRM_RAS_CMD_GET_ERROR_THRESHOLD, + .doit = drm_ras_nl_get_error_threshold_doit, + .policy = drm_ras_get_error_threshold_nl_policy, + .maxattr = DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, }; struct genl_family drm_ras_nl_family __ro_after_init = { diff --git a/drivers/gpu/drm/drm_ras_nl.h b/drivers/gpu/drm/drm_ras_nl.h index a398643572a5..399280c2c6e1 100644 --- a/drivers/gpu/drm/drm_ras_nl.h +++ 
b/drivers/gpu/drm/drm_ras_nl.h @@ -20,6 +20,8 @@ int drm_ras_nl_get_error_counter_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int drm_ras_nl_clear_error_counter_doit(struct sk_buff *skb, struct genl_info *info); +int drm_ras_nl_get_error_threshold_doit(struct sk_buff *skb, + struct genl_info *info); extern struct genl_family drm_ras_nl_family; diff --git a/include/drm/drm_ras.h b/include/drm/drm_ras.h index f2a787bc4f64..7bb429d85f57 100644 --- a/include/drm/drm_ras.h +++ b/include/drm/drm_ras.h @@ -69,6 +69,21 @@ struct drm_ras_node { */ int (*clear_error_counter)(struct drm_ras_node *node, u32 error_id); + /** + * @query_error_threshold: + * + * This callback is used by drm-ras to query threshold value of a + * specific error. + * + * Driver should expect query_error_threshold() to be called with + * error_id from `error_counter_range.first` to + * `error_counter_range.last`. + * + * Returns: 0 on success, negative error code on failure. + */ + int (*query_error_threshold)(struct drm_ras_node *node, u32 error_id, + const char **name, u32 *val); + /** @priv: Driver private data */ void *priv; }; diff --git a/include/uapi/drm/drm_ras.h b/include/uapi/drm/drm_ras.h index 218a3ee86805..59530f987ba2 100644 --- a/include/uapi/drm/drm_ras.h +++ b/include/uapi/drm/drm_ras.h @@ -33,6 +33,7 @@ enum { DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_ID, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_NAME, DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_VALUE, + DRM_RAS_A_ERROR_COUNTER_ATTRS_ERROR_THRESHOLD, __DRM_RAS_A_ERROR_COUNTER_ATTRS_MAX, DRM_RAS_A_ERROR_COUNTER_ATTRS_MAX = (__DRM_RAS_A_ERROR_COUNTER_ATTRS_MAX - 1) @@ -42,6 +43,7 @@ enum { DRM_RAS_CMD_LIST_NODES = 1, DRM_RAS_CMD_GET_ERROR_COUNTER, DRM_RAS_CMD_CLEAR_ERROR_COUNTER, + DRM_RAS_CMD_GET_ERROR_THRESHOLD, __DRM_RAS_CMD_MAX, DRM_RAS_CMD_MAX = (__DRM_RAS_CMD_MAX - 1) -- 2.43.0