From patchwork Tue Aug 8 01:21:45 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [01/11] drm/xe/oa: Introduce OA uapi From: Ashutosh Dixit X-Patchwork-Id: 551681 Message-Id: <20230808012155.38531-2-ashutosh.dixit@intel.com> To: intel-gfx-trybot@lists.freedesktop.org Date: Mon, 7 Aug 2023 18:21:45 -0700 OA uapi allows userspace to: * Read streams of performance counters written by hardware * Configure (and reconfigure) which sets of perf counters are captured as part of OA streams * Configure other properties (such as format and periodicity) of such captures. * Query associated parameters such as OA unit timestamp freq, oa_unit_id's for hw engines and OA ioctl version Signed-off-by: Ashutosh Dixit --- include/uapi/drm/xe_drm.h | 257 +++++++++++++++++++++++++++++++++++++- 1 file changed, 256 insertions(+), 1 deletion(-) diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index 86f16d50e9ccc..b4ab07c285245 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -111,6 +111,9 @@ struct xe_user_extension { #define DRM_XE_WAIT_USER_FENCE 0x0b #define DRM_XE_VM_MADVISE 0x0c #define DRM_XE_EXEC_QUEUE_GET_PROPERTY 0x0d +#define DRM_XE_OA_OPEN 0x36 +#define DRM_XE_OA_ADD_CONFIG 0x37 +#define DRM_XE_OA_REMOVE_CONFIG 0x38 /* Must be kept compact -- no holes */ #define DRM_IOCTL_XE_DEVICE_QUERY DRM_IOWR(DRM_COMMAND_BASE + DRM_XE_DEVICE_QUERY, struct drm_xe_device_query) @@ -127,6 +130,9 @@ struct xe_user_extension { #define DRM_IOCTL_XE_EXEC_QUEUE_SET_PROPERTY DRM_IOW(DRM_COMMAND_BASE + DRM_XE_EXEC_QUEUE_SET_PROPERTY, struct drm_xe_exec_queue_set_property) #define DRM_IOCTL_XE_WAIT_USER_FENCE DRM_IOWR(DRM_COMMAND_BASE + DRM_XE_WAIT_USER_FENCE, struct drm_xe_wait_user_fence) #define DRM_IOCTL_XE_VM_MADVISE DRM_IOW(DRM_COMMAND_BASE + DRM_XE_VM_MADVISE, struct drm_xe_vm_madvise) +#define DRM_IOCTL_XE_OA_OPEN DRM_IOW(DRM_COMMAND_BASE + DRM_XE_OA_OPEN, struct drm_xe_oa_open_param) +#define DRM_IOCTL_XE_OA_ADD_CONFIG DRM_IOW(DRM_COMMAND_BASE + DRM_XE_OA_ADD_CONFIG, struct drm_xe_oa_config) +#define DRM_IOCTL_XE_OA_REMOVE_CONFIG DRM_IOW(DRM_COMMAND_BASE + DRM_XE_OA_REMOVE_CONFIG, __u64) /** * enum drm_xe_memory_class - Supported memory classes. 
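As a rough userspace sketch of how these ioctls are expected to be driven (not part of the patch; the drm fd is assumed to be an already-open xe device node, the metrics set id is assumed to come from DRM_XE_OA_ADD_CONFIG or the metrics sysfs, and the property, struct and format names are the ones added further down in this header):

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/xe_drm.h>

static int xe_oa_open_stream(int drm_fd, __u64 metrics_set_id)
{
        __u64 props[] = {
                DRM_XE_OA_PROP_SAMPLE_OA,       1,
                DRM_XE_OA_PROP_OA_METRICS_SET,  metrics_set_id,
                DRM_XE_OA_PROP_OA_FORMAT,       XE_OA_FORMAT_A32u40_A4u32_B8_C8,
                DRM_XE_OA_PROP_OA_EXPONENT,     16,     /* periodic sampling */
        };
        struct drm_xe_oa_open_param param = {
                .flags = XE_OA_FLAG_FD_CLOEXEC,
                .num_properties = sizeof(props) / (2 * sizeof(__u64)),
                .properties_ptr = (uintptr_t)props,
        };

        /* On success a new OA stream fd is returned */
        return ioctl(drm_fd, DRM_IOCTL_XE_OA_OPEN, &param);
}

The returned stream fd is then enabled with XE_OA_IOCTL_ENABLE, and read() on it returns records framed by struct drm_xe_oa_record_header, both of which are also defined later in this header.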
@@ -261,7 +267,8 @@ struct drm_xe_query_config { #define XE_QUERY_CONFIG_GT_COUNT 4 #define XE_QUERY_CONFIG_MEM_REGION_COUNT 5 #define XE_QUERY_CONFIG_MAX_ENGINE_PRIORITY 6 -#define XE_QUERY_CONFIG_NUM_PARAM (XE_QUERY_CONFIG_MAX_ENGINE_PRIORITY + 1) +#define XE_QUERY_OA_IOCTL_VERSION 7 +#define XE_QUERY_CONFIG_NUM_PARAM (XE_QUERY_OA_IOCTL_VERSION + 1) /** @info: array of elements containing the config info */ __u64 info[]; }; @@ -298,6 +305,7 @@ struct drm_xe_query_gts { __u64 native_mem_regions; /* bit mask of instances from drm_xe_query_mem_usage */ __u64 slow_mem_regions; /* bit mask of instances from drm_xe_query_mem_usage */ __u64 inaccessible_mem_regions; /* bit mask of instances from drm_xe_query_mem_usage */ + __u64 oa_timestamp_freq; __u64 reserved[8]; } gts[]; }; @@ -753,6 +761,7 @@ struct drm_xe_engine_class_instance { __u16 engine_instance; __u16 gt_id; + __u16 oa_unit_id; }; struct drm_xe_exec_queue_create { @@ -1056,6 +1065,252 @@ struct drm_xe_vm_madvise { __u64 reserved[2]; }; +enum drm_xe_oa_format { + XE_OA_FORMAT_C4_B8 = 7, + + /* Gen8+ */ + XE_OA_FORMAT_A12, + XE_OA_FORMAT_A12_B8_C8, + XE_OA_FORMAT_A32u40_A4u32_B8_C8, + + /* DG2 */ + XE_OAR_FORMAT_A32u40_A4u32_B8_C8, + XE_OA_FORMAT_A24u40_A14u32_B8_C8, + + /* MTL OAM */ + XE_OAM_FORMAT_MPEC8u64_B8_C8, + XE_OAM_FORMAT_MPEC8u32_B8_C8, + + XE_OA_FORMAT_MAX /* non-ABI */ +}; + +enum drm_xe_oa_property_id { + /** + * Open the stream for a specific exec queue id (as used with + * drm_xe_exec). A stream opened for a specific exec queue id this + * way won't typically require root privileges. + */ + DRM_XE_OA_PROP_EXEC_QUEUE_ID = 1, + + /** + * A value of 1 requests the inclusion of raw OA unit reports as + * part of stream samples. + */ + DRM_XE_OA_PROP_SAMPLE_OA, + + /** + * The value specifies which set of OA unit metrics should be + * configured, defining the contents of any OA unit reports. + */ + DRM_XE_OA_PROP_OA_METRICS_SET, + + /** + * The value specifies the size and layout of OA unit reports. + */ + DRM_XE_OA_PROP_OA_FORMAT, + + /** + * Specifying this property implicitly requests periodic OA unit + * sampling and (at least on Haswell) the sampling frequency is derived + * from this exponent as follows: + * + * 80ns * 2^(period_exponent + 1) + */ + DRM_XE_OA_PROP_OA_EXPONENT, + + /** + * Specifying this property is only valid when specify a context to + * filter with DRM_XE_OA_PROP_ENGINE_ID. Specifying this property + * will hold preemption of the particular engine we want to gather + * performance data about. + */ + DRM_XE_OA_PROP_HOLD_PREEMPTION, + + /** + * Specifying this pins all contexts to the specified SSEU power + * configuration for the duration of the recording. + * + * This parameter's value is a pointer to a struct + * drm_xe_gem_context_param_sseu (TBD). + */ + DRM_XE_OA_PROP_GLOBAL_SSEU, + + /** + * This optional parameter specifies the timer interval in nanoseconds + * at which the xe driver will check the OA buffer for available data. + * Minimum allowed value is 100 microseconds. A default value is used by + * the driver if this parameter is not specified. Note that larger timer + * values will reduce cpu consumption during OA perf captures. However, + * excessively large values would potentially result in OA buffer + * overwrites as captures reach end of the OA buffer. + */ + DRM_XE_OA_PROP_POLL_OA_PERIOD, + + /** + * Multiple engines may be mapped to the same OA unit. The OA unit is + * identified by class:instance of any engine mapped to it. 
+ * + * This parameter specifies the engine class and must be passed along + * with DRM_XE_OA_PROP_OA_ENGINE_INSTANCE. + */ + DRM_XE_OA_PROP_OA_ENGINE_CLASS, + + /** + * This parameter specifies the engine instance and must be passed along + * with DRM_XE_OA_PROP_OA_ENGINE_CLASS. + */ + DRM_XE_OA_PROP_OA_ENGINE_INSTANCE, + + DRM_XE_OA_PROP_MAX /* non-ABI */ +}; + +struct drm_xe_oa_open_param { + __u32 flags; +#define XE_OA_FLAG_FD_CLOEXEC BIT(0) +#define XE_OA_FLAG_FD_NONBLOCK BIT(1) +#define XE_OA_FLAG_DISABLED BIT(2) + + /** The number of u64 (id, value) pairs */ + __u32 num_properties; + + /** + * Pointer to array of u64 (id, value) pairs configuring the stream + * to open. + */ + __u64 properties_ptr; +}; + +/* + * Enable data capture for a stream that was either opened in a disabled state + * via I915_PERF_FLAG_DISABLED or was later disabled via + * I915_PERF_IOCTL_DISABLE. + * + * It is intended to be cheaper to disable and enable a stream than it may be + * to close and re-open a stream with the same configuration. + * + * It's undefined whether any pending data for the stream will be lost. + */ +#define XE_OA_IOCTL_ENABLE _IO('i', 0x0) + +/* + * Disable data capture for a stream. + * + * It is an error to try and read a stream that is disabled. + */ +#define XE_OA_IOCTL_DISABLE _IO('i', 0x1) + +/* + * Change metrics_set captured by a stream. + * + * If the stream is bound to a specific context, the configuration change + * will performed inline with that context such that it takes effect before + * the next execbuf submission. + * + * Returns the previously bound metrics set id, or a negative error code. + */ +#define XE_OA_IOCTL_CONFIG _IO('i', 0x2) + +struct drm_xe_oa_record_header { + __u32 type; + __u16 pad; + __u16 size; +}; + +enum drm_xe_oa_record_type { + /** + * Samples are the work horse record type whose contents are + * extensible and defined when opening an xe oa stream based on the + * given properties. + * + * Boolean properties following the naming convention + * DRM_XE_OA_SAMPLE_xyz_PROP request the inclusion of 'xyz' data in + * every sample. + * + * The order of these sample properties given by userspace has no + * affect on the ordering of data within a sample. The order is + * documented here. + * + * struct { + * struct drm_xe_oa_record_header header; + * + * { u32 oa_report[]; } && DRM_XE_OA_PROP_SAMPLE_OA + * }; + */ + DRM_XE_OA_RECORD_SAMPLE = 1, + + /** + * Indicates that one or more OA reports were not written by the + * hardware. This can happen for example if an MI_REPORT_PERF_COUNT + * command collides with periodic sampling - which would be more likely + * at higher sampling frequencies. + */ + DRM_XE_OA_RECORD_OA_REPORT_LOST = 2, + + /** + * An error occurred that resulted in all pending OA reports being lost. + */ + DRM_XE_OA_RECORD_OA_BUFFER_LOST = 3, + + DRM_XE_OA_RECORD_MAX /* non-ABI */ +}; + +struct drm_xe_oa_config { + /** + * @uuid: + * + * String formatted like "%\08x-%\04x-%\04x-%\04x-%\012x" + */ + char uuid[36]; + + /** + * @n_mux_regs: + * + * Number of mux regs in &mux_regs_ptr. + */ + __u32 n_mux_regs; + + /** + * @n_boolean_regs: + * + * Number of boolean regs in &boolean_regs_ptr. + */ + __u32 n_boolean_regs; + + /** + * @n_flex_regs: + * + * Number of flex regs in &flex_regs_ptr. + */ + __u32 n_flex_regs; + + /** + * @mux_regs_ptr: + * + * Pointer to tuples of u32 values (register address, value) for mux + * registers. Expected length of buffer is (2 * sizeof(u32) * + * &n_mux_regs). 
+ */ + __u64 mux_regs_ptr; + + /** + * @boolean_regs_ptr: + * + * Pointer to tuples of u32 values (register address, value) for mux + * registers. Expected length of buffer is (2 * sizeof(u32) * + * &n_boolean_regs). + */ + __u64 boolean_regs_ptr; + + /** + * @flex_regs_ptr: + * + * Pointer to tuples of u32 values (register address, value) for mux + * registers. Expected length of buffer is (2 * sizeof(u32) * + * &n_flex_regs). + */ + __u64 flex_regs_ptr; +}; + #if defined(__cplusplus) } #endif From patchwork Tue Aug 8 01:21:46 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [02/11] drm/xe/oa: Add OA types From: Ashutosh Dixit X-Patchwork-Id: 551687 Message-Id: <20230808012155.38531-3-ashutosh.dixit@intel.com> To: intel-gfx-trybot@lists.freedesktop.org Date: Mon, 7 Aug 2023 18:21:46 -0700 Add types and data structs used by OA. The data structs maintain device and gt level information, information about the open OA stream and OA buffer used internally to capture OA counters written by HW as well as capture configurations which can be selected for an OA stream. v2: Add linux includes to fix build Signed-off-by: Ashutosh Dixit --- drivers/gpu/drm/xe/xe_oa_types.h | 295 +++++++++++++++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 drivers/gpu/drm/xe/xe_oa_types.h diff --git a/drivers/gpu/drm/xe/xe_oa_types.h b/drivers/gpu/drm/xe/xe_oa_types.h new file mode 100644 index 0000000000000..58164ff0b6a48 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_oa_types.h @@ -0,0 +1,295 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_OA_TYPES_H_ +#define _XE_OA_TYPES_H__ + +#include +#include +#include +#include +#include +#include "regs/xe_reg_defs.h" + +struct drm_device; +struct drm_file; + +enum { + OA_GROUP_OAG = 0, + OA_GROUP_OAM_SAMEDIA_0 = 0, + + OA_GROUP_MAX, + OA_GROUP_INVALID = U32_MAX, +}; + +enum oa_type { + TYPE_OAG, + TYPE_OAM, +}; + +enum report_header { + HDR_32_BIT = 0, + HDR_64_BIT, +}; + +struct xe_oa_format { + u32 format; + int size; + int type; + enum report_header header; +}; + +struct xe_oa_reg { + struct xe_reg addr; + u32 value; +}; + +struct xe_oa_config { + struct xe_oa *oa; + + char uuid[UUID_STRING_LEN + 1]; + int id; + + const struct xe_oa_reg *mux_regs; + u32 mux_regs_len; + const struct xe_oa_reg *b_counter_regs; + u32 b_counter_regs_len; + const struct xe_oa_reg *flex_regs; + u32 flex_regs_len; + + struct attribute_group sysfs_metric; + struct attribute *attrs[2]; + struct kobj_attribute sysfs_metric_id; + + struct kref ref; + struct rcu_head rcu; +}; + +struct xe_oa_regs { + u32 base; + struct xe_reg oa_head_ptr; + struct xe_reg oa_tail_ptr; + struct xe_reg oa_buffer; + struct xe_reg oa_ctx_ctrl; + struct xe_reg oa_ctrl; + struct xe_reg oa_debug; + struct xe_reg oa_status; + u32 oa_ctrl_counter_format_shift; +}; + +/** + * struct xe_oa_group - OA group representing one hardware OA unit + */ +struct xe_oa_group { + /** @oa_unit_id: identifier for the OA unit */ + u32 oa_unit_id; + + /** + * @exclusive_stream: The stream currently using the OA unit. This is + * sometimes accessed outside a syscall associated to its file + * descriptor. + */ + struct xe_oa_stream *exclusive_stream; + + /** @num_engines: number of engines using this OA unit */ + u32 num_engines; + + /** @regs: OA buffer register group for programming the OA unit */ + struct xe_oa_regs regs; + + /** @type: Type of OA unit - OAM, OAG etc. 
*/ + enum oa_type type; +}; + +/** + * struct xe_oa_gt - OA per-gt information + */ +struct xe_oa_gt { + /** @lock: lock associated with anything below within this structure */ + struct mutex lock; + + /** @num_oa_groups: number of oa groups per gt */ + u32 num_oa_groups; + + /** @group: list of OA groups - one for each OA buffer */ + struct xe_oa_group *group; +}; + +/** + * struct xe_oa - OA device level information + */ +struct xe_oa { + /** @xe: back pointer to xe device */ + struct xe_device *xe; + + /** @metrics_kobj: kobj for metrics sysfs */ + struct kobject *metrics_kobj; + + /** + * @metrics_lock: lock associated with adding/modifying/removing OA + * configs in oa->metrics_idr. + */ + struct mutex metrics_lock; + + /** + * @metrics_idr: List of dynamic configurations (struct xe_oa_config) + */ + struct idr metrics_idr; + + /** @ctx_oactxctrl_offset: offset of OACTXCONTROL register in context image */ + u32 ctx_oactxctrl_offset; + + /** @oa_formats: tracks all OA formats across platforms */ + const struct xe_oa_format *oa_formats; + +#define FORMAT_MASK_SIZE DIV_ROUND_UP(XE_OA_FORMAT_MAX - 1, BITS_PER_LONG) + + /** @format_mask: tracks valid OA formats for a platform */ + unsigned long format_mask[FORMAT_MASK_SIZE]; + + /** @oa_unit_ids: tracks oa unit ids assigned across gt's */ + u32 oa_unit_ids; +}; + +/** + * struct xe_oa_stream - state for a single open stream FD + */ +struct xe_oa_stream { + /** @oa: xe_oa backpointer */ + struct xe_oa *oa; + + /** @gt: gt associated with the oa stream */ + struct xe_gt *gt; + + /** + * @hwe: hardware engine associated with this performance stream. + */ + struct xe_hw_engine *hwe; + + /** @lock: Lock associated with operations on stream */ + struct mutex lock; + + /** + * @sample: true when DRM_XE_OA_PROP_SAMPLE_OA is given when + * opening a stream, representing the contents of a single sample + * as read() by userspace. + */ + bool sample; + + /** + * @sample_size: Considering the configured contents of a sample + * combined with the required header size, this is the total size + * of a single sample record. + */ + int sample_size; + + /** + * @exec_q: %NULL if measuring system-wide across all exec_q's or a + * specific exec_q that is being monitored. + */ + struct xe_exec_queue *exec_q; + + /** + * @enabled: Whether the stream is currently enabled, considering + * whether the stream was opened in a disabled state and based + * on `XE_OA_IOCTL_ENABLE` and `XE_OA_IOCTL_DISABLE` calls. + */ + bool enabled; + + /** @oa_config: The OA configuration used by the stream */ + struct xe_oa_config *oa_config; + + /** + * @oa_config_bos: A list of struct i915_oa_config_bo allocated lazily + * each time @oa_config changes. + */ + struct llist_head oa_config_bos; + + /** @specific_ctx_id: id of the context used for filtering reports */ + u32 specific_ctx_id; + + /** @specific_ctx_id_mask: The mask used to masking specific_ctx_id bits */ + u32 specific_ctx_id_mask; + + /** + * @poll_check_timer: High resolution timer that will periodically + * check for data in the circular OA buffer for notifying userspace + * (e.g. during a read() or poll()). + */ + struct hrtimer poll_check_timer; + + /** + * @poll_wq: The wait queue that hrtimer callback wakes when it + * sees data ready to read in the circular OA buffer. 
+ */ + wait_queue_head_t poll_wq; + + /** @pollin: Whether there is data available to read */ + bool pollin; + + /** @periodic: Whether periodic sampling is currently enabled */ + bool periodic; + + /** @period_exponent: The OA unit sampling frequency is derived from this */ + int period_exponent; + + /** @oa_buffer: State of the OA buffer */ + struct { + /** @format: data format */ + const struct xe_oa_format *format; + + /** @format: xe_bo backing the OA buffer */ + struct xe_bo *bo; + + /** @vaddr: mapped vaddr of the OA buffer */ + u8 *vaddr; + + /** @last_ctx_id: last context id for OA data added */ + u32 last_ctx_id; + + /** + * @ptr_lock: Locks reads and writes to all head/tail state + * + * Consider: the head and tail pointer state needs to be read + * consistently from a hrtimer callback (atomic context) and + * read() fop (user context) with tail pointer updates happening + * in atomic context and head updates in user context and the + * (unlikely) possibility of read() errors needing to reset all + * head/tail state. + * + * Note: Contention/performance aren't currently a significant + * concern here considering the relatively low frequency of + * hrtimer callbacks (5ms period) and that reads typically only + * happen in response to a hrtimer event and likely complete + * before the next callback. + * + * Note: This lock is not held *while* reading and copying data + * to userspace so the value of head observed in htrimer + * callbacks won't represent any partial consumption of data. + */ + spinlock_t ptr_lock; + + /** + * @head: Although we can always read back the head pointer register, + * we prefer to avoid trusting the HW state, just to avoid any + * risk that some hardware condition could somehow bump the + * head pointer unpredictably and cause us to forward the wrong + * OA buffer data to userspace. + */ + u32 head; + + /** + * @tail: The last verified tail that can be read by userspace. + */ + u32 tail; + } oa_buffer; + + /** + * @poll_oa_period: The period in nanoseconds at which the OA + * buffer should be checked for available data. + */ + u64 poll_oa_period; +}; +#endif From patchwork Tue Aug 8 01:21:47 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [03/11] drm/xe/oa: Add registers and GPU commands used by OA From: Ashutosh Dixit X-Patchwork-Id: 551682 Message-Id: <20230808012155.38531-4-ashutosh.dixit@intel.com> To: intel-gfx-trybot@lists.freedesktop.org Date: Mon, 7 Aug 2023 18:21:47 -0700 Add registers and GPU commands used by OA in subsequent patches. The xe oa code programs OA units which generate performance data. The code also submits command buffers to change hardware engine context images and implement waits. 
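To illustrate the kind of command emission these additions enable (a sketch, not code from this series; 'cs' is a hypothetical dword write pointer into a batch buffer, and 'fmt' an OA counter format value):

static u32 *emit_oa_reg_write(u32 *cs, struct xe_reg reg, u32 value)
{
        /* MI_LOAD_REGISTER_IMM writing a single (register, value) pair */
        *cs++ = MI_LOAD_REGISTER_IMM(1);
        *cs++ = reg.addr;
        *cs++ = value;
        return cs;
}

/* e.g. enable the OAR counters in a given report format */
cs = emit_oa_reg_write(cs, GEN12_OAR_OACONTROL,
                       (fmt << GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT) |
                       GEN12_OAR_OACONTROL_COUNTER_ENABLE);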
Signed-off-by: Ashutosh Dixit --- drivers/gpu/drm/xe/regs/xe_engine_regs.h | 5 + drivers/gpu/drm/xe/regs/xe_gpu_commands.h | 27 ++++ drivers/gpu/drm/xe/regs/xe_oa_regs.h | 173 ++++++++++++++++++++++ 3 files changed, 205 insertions(+) create mode 100644 drivers/gpu/drm/xe/regs/xe_oa_regs.h diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h index 79873bf64e8dd..044a4920f1568 100644 --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h @@ -84,6 +84,9 @@ #define RING_CTX_TIMESTAMP(base) XE_REG((base) + 0x3a8) +#define MI_PREDICATE_RESULT_2(base) XE_REG((base) + 0x3bc) +#define MI_PREDICATE_RESULT_1(base) XE_REG((base) + 0x41c) + #define RING_FORCE_TO_NONPRIV(base, i) XE_REG(((base) + 0x4d0) + (i) * 4) #define RING_FORCE_TO_NONPRIV_DENY REG_BIT(30) #define RING_FORCE_TO_NONPRIV_ACCESS_MASK REG_GENMASK(29, 28) @@ -108,6 +111,8 @@ #define RING_EXECLIST_CONTROL(base) XE_REG((base) + 0x550) #define EL_CTRL_LOAD REG_BIT(0) +#define GEN8_RING_CS_GPR(base, n) XE_REG((base) + 0x600 + (n) * 8) + #define VDBOX_CGCTL3F10(base) XE_REG((base) + 0x3f10) #define IECPUNIT_CLKGATE_DIS REG_BIT(22) diff --git a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h index 12120dd37aa2a..672100d375312 100644 --- a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h +++ b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h @@ -14,8 +14,10 @@ #define MI_INSTR(opcode, flags) \ (__INSTR(INSTR_MI_CLIENT) | (opcode) << 23 | (flags)) +#define MI_OPCODE(x) (((x) >> 23) & 0x3f) #define MI_NOOP MI_INSTR(0, 0) +#define MI_SET_PREDICATE MI_INSTR(0x01, 0) #define MI_USER_INTERRUPT MI_INSTR(0x02, 0) #define MI_ARB_ON_OFF MI_INSTR(0x08, 0) @@ -23,12 +25,32 @@ #define MI_ARB_DISABLE (0<<0) #define MI_BATCH_BUFFER_END MI_INSTR(0x0a, 0) + +#define MI_MATH(x) MI_INSTR(0x1a, (x) - 1) +#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2)) +#define MI_MATH_LOAD(op1, op2) MI_MATH_INSTR(0x080, op1, op2) +#define MI_MATH_ADD MI_MATH_INSTR(0x100, 0x0, 0x0) +#define MI_MATH_SUB MI_MATH_INSTR(0x101, 0x0, 0x0) +#define MI_MATH_STORE(op1, op2) MI_MATH_INSTR(0x180, op1, op2) +#define MI_MATH_STOREINV(op1, op2) MI_MATH_INSTR(0x580, op1, op2) +#define MI_MATH_REG(x) (x) +#define MI_MATH_REG_SRCA 0x20 +#define MI_MATH_REG_SRCB 0x21 +#define MI_MATH_REG_ACCU 0x31 +#define MI_MATH_REG_CF 0x33 + #define MI_STORE_DATA_IMM MI_INSTR(0x20, 0) +#define MI_STORE_DWORD_IMM_GEN4 MI_INSTR(0x20, 2) #define MI_LOAD_REGISTER_IMM(x) MI_INSTR(0x22, 2*(x)-1) #define MI_LRI_LRM_CS_MMIO REG_BIT(19) #define MI_LRI_MMIO_REMAP_EN REG_BIT(17) #define MI_LRI_FORCE_POSTED (1<<12) +#define IS_MI_LRI_CMD(x) (MI_OPCODE(x) == MI_OPCODE(MI_INSTR(0x22, 0))) +#define MI_LRI_LEN(x) (((x) & 0xff) + 1) + +#define MI_STORE_REGISTER_MEM MI_INSTR(0x24, 1) +#define MI_SRM_LRM_GLOBAL_GTT REG_BIT(22) #define MI_FLUSH_DW MI_INSTR(0x26, 1) #define MI_FLUSH_DW_STORE_INDEX (1<<21) @@ -37,7 +59,12 @@ #define MI_FLUSH_DW_OP_STOREDW (1<<14) #define MI_FLUSH_DW_USE_GTT (1<<2) +#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 1) + +#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 1) + #define MI_BATCH_BUFFER_START MI_INSTR(0x31, 1) +#define MI_BATCH_PREDICATE REG_BIT(15) /* HSW+ on RCS only*/ #define XY_CTRL_SURF_COPY_BLT ((2 << 29) | (0x48 << 22) | 3) #define SRC_ACCESS_TYPE_SHIFT 21 diff --git a/drivers/gpu/drm/xe/regs/xe_oa_regs.h b/drivers/gpu/drm/xe/regs/xe_oa_regs.h new file mode 100644 index 0000000000000..0b378cb7a6ddb --- /dev/null +++ b/drivers/gpu/drm/xe/regs/xe_oa_regs.h @@ 
-0,0 +1,173 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef __XE_OA_REGS__ +#define __XE_OA_REGS__ + +#define REG_EQUAL(reg, xe_reg) ((reg) == (xe_reg.addr)) +#define REG_EQUAL_MCR(reg, xe_reg) ((reg) == (xe_reg.__reg.addr)) + +#define HALF_SLICE_CHICKEN2 XE_REG_MCR(0xe180) +#define GEN8_ST_PO_DISABLE REG_BIT(13) + +#define GEN7_ROW_CHICKEN2 XE_REG(0xe4f4) +#define GEN8_ROW_CHICKEN XE_REG_MCR(0xe4f0) +#define STALL_DOP_GATING_DISABLE REG_BIT(5) +#define GEN12_DISABLE_DOP_GATING REG_BIT(0) + +#define RPM_CONFIG1 XE_REG(0xd04) +#define GEN10_GT_NOA_ENABLE REG_BIT(9) + +#define WAIT_FOR_RC6_EXIT XE_REG(0x20cc) +#define HSW_WAIT_FOR_RC6_EXIT_ENABLE REG_BIT(0) + +#define EU_PERF_CNTL0 XE_REG(0xe458) +#define EU_PERF_CNTL4 XE_REG(0xe45c) +#define EU_PERF_CNTL1 XE_REG(0xe558) +#define EU_PERF_CNTL5 XE_REG(0xe55c) +#define EU_PERF_CNTL2 XE_REG(0xe658) +#define EU_PERF_CNTL6 XE_REG(0xe65c) +#define EU_PERF_CNTL3 XE_REG(0xe758) + +#define OABUFFER_SIZE_MASK REG_GENMASK(5, 3) +#define OABUFFER_SIZE_128K REG_FIELD_PREP(OABUFFER_SIZE_MASK, 0) +#define OABUFFER_SIZE_256K REG_FIELD_PREP(OABUFFER_SIZE_MASK, 1) +#define OABUFFER_SIZE_512K REG_FIELD_PREP(OABUFFER_SIZE_MASK, 2) +#define OABUFFER_SIZE_1M REG_FIELD_PREP(OABUFFER_SIZE_MASK, 3) +#define OABUFFER_SIZE_2M REG_FIELD_PREP(OABUFFER_SIZE_MASK, 4) +#define OABUFFER_SIZE_4M REG_FIELD_PREP(OABUFFER_SIZE_MASK, 5) +#define OABUFFER_SIZE_8M REG_FIELD_PREP(OABUFFER_SIZE_MASK, 6) +#define OABUFFER_SIZE_16M REG_FIELD_PREP(OABUFFER_SIZE_MASK, 7) + +#define GEN12_OA_TLB_INV_CR XE_REG(0xceec) + +/* Gen12 OAR unit */ +#define GEN12_OAR_OACONTROL XE_REG(0x2960) +#define GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT 1 +#define GEN12_OAR_OACONTROL_COUNTER_ENABLE REG_BIT(0) + +#define GEN8_OACTXCONTROL XE_REG(0x2360) +#define GEN8_OA_COUNTER_RESUME REG_BIT(0) + +#define GEN12_OACTXCONTROL(base) XE_REG((base) + 0x360) +#define GEN12_OAR_OASTATUS XE_REG(0x2968) + +/* Gen12 OAG unit */ +#define GEN12_OAG_OAHEADPTR XE_REG(0xdb00) +#define GEN12_OAG_OAHEADPTR_MASK 0xffffffc0 +#define GEN12_OAG_OATAILPTR XE_REG(0xdb04) +#define GEN12_OAG_OATAILPTR_MASK 0xffffffc0 + +#define GEN12_OAG_OABUFFER XE_REG(0xdb08) +#define GEN12_OAG_OABUFFER_BUFFER_SIZE_MASK (0x7) +#define GEN12_OAG_OABUFFER_BUFFER_SIZE_SHIFT (3) +#define GEN12_OAG_OABUFFER_MEMORY_SELECT REG_BIT(0) /* 0: PPGTT, 1: GGTT */ + +#define GEN12_OAG_OAGLBCTXCTRL XE_REG(0x2b28) +#define GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT 2 +#define GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE REG_BIT(1) +#define GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME REG_BIT(0) + +#define GEN12_OAG_OACONTROL XE_REG(0xdaf4) +#define GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT 2 +#define GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE REG_BIT(0) + +#define GEN12_OAG_OA_DEBUG XE_REG(0xdaf8) +#define GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO REG_BIT(6) +#define GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS REG_BIT(5) +#define GEN12_OAG_OA_DEBUG_DISABLE_GO_1_0_REPORTS REG_BIT(2) +#define GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS REG_BIT(1) + +#define GEN12_OAG_OASTATUS XE_REG(0xdafc) +#define GEN12_OAG_OASTATUS_COUNTER_OVERFLOW REG_BIT(2) +#define GEN12_OAG_OASTATUS_BUFFER_OVERFLOW REG_BIT(1) +#define GEN12_OAG_OASTATUS_REPORT_LOST REG_BIT(0) + +#define GDT_CHICKEN_BITS XE_REG(0x9840) +#define GT_NOA_ENABLE 0x00000080 + +#define GEN12_SQCNT1 XE_REG(0x8718) +#define GEN12_SQCNT1_PMON_ENABLE REG_BIT(30) +#define GEN12_SQCNT1_OABPC REG_BIT(29) + +/* Gen12 OAM unit */ +#define GEN12_OAM_HEAD_POINTER_OFFSET (0x1a0) +#define 
GEN12_OAM_HEAD_POINTER_MASK 0xffffffc0 + +#define GEN12_OAM_TAIL_POINTER_OFFSET (0x1a4) +#define GEN12_OAM_TAIL_POINTER_MASK 0xffffffc0 + +#define GEN12_OAM_BUFFER_OFFSET (0x1a8) +#define GEN12_OAM_BUFFER_SIZE_MASK (0x7) +#define GEN12_OAM_BUFFER_SIZE_SHIFT (3) +#define GEN12_OAM_BUFFER_MEMORY_SELECT REG_BIT(0) /* 0: PPGTT, 1: GGTT */ + +#define GEN12_OAM_CONTEXT_CONTROL_OFFSET (0x1bc) +#define GEN12_OAM_CONTEXT_CONTROL_TIMER_PERIOD_SHIFT 2 +#define GEN12_OAM_CONTEXT_CONTROL_TIMER_ENABLE REG_BIT(1) +#define GEN12_OAM_CONTEXT_CONTROL_COUNTER_RESUME REG_BIT(0) + +#define GEN12_OAM_CONTROL_OFFSET (0x194) +#define GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT 1 +#define GEN12_OAM_CONTROL_COUNTER_ENABLE REG_BIT(0) + +#define GEN12_OAM_DEBUG_OFFSET (0x198) +#define GEN12_OAM_DEBUG_BUFFER_SIZE_SELECT REG_BIT(12) +#define GEN12_OAM_DEBUG_INCLUDE_CLK_RATIO REG_BIT(6) +#define GEN12_OAM_DEBUG_DISABLE_CLK_RATIO_REPORTS REG_BIT(5) +#define GEN12_OAM_DEBUG_DISABLE_GO_1_0_REPORTS REG_BIT(2) +#define GEN12_OAM_DEBUG_DISABLE_CTX_SWITCH_REPORTS REG_BIT(1) + +#define GEN12_OAM_STATUS_OFFSET (0x19c) +#define GEN12_OAM_STATUS_COUNTER_OVERFLOW REG_BIT(2) +#define GEN12_OAM_STATUS_BUFFER_OVERFLOW REG_BIT(1) +#define GEN12_OAM_STATUS_REPORT_LOST REG_BIT(0) + +#define GEN12_OAM_MMIO_TRG_OFFSET (0x1d0) + +#define GEN12_OAM_MMIO_TRG(base) \ + XE_REG((base) + GEN12_OAM_MMIO_TRG_OFFSET) + +#define GEN12_OAM_HEAD_POINTER(base) \ + XE_REG((base) + GEN12_OAM_HEAD_POINTER_OFFSET) +#define GEN12_OAM_TAIL_POINTER(base) \ + XE_REG((base) + GEN12_OAM_TAIL_POINTER_OFFSET) +#define GEN12_OAM_BUFFER(base) \ + XE_REG((base) + GEN12_OAM_BUFFER_OFFSET) +#define GEN12_OAM_CONTEXT_CONTROL(base) \ + XE_REG((base) + GEN12_OAM_CONTEXT_CONTROL_OFFSET) +#define GEN12_OAM_CONTROL(base) \ + XE_REG((base) + GEN12_OAM_CONTROL_OFFSET) +#define GEN12_OAM_DEBUG(base) \ + XE_REG((base) + GEN12_OAM_DEBUG_OFFSET) +#define GEN12_OAM_STATUS(base) \ + XE_REG((base) + GEN12_OAM_STATUS_OFFSET) + +#define GEN12_OAM_CEC0_0_OFFSET (0x40) +#define GEN12_OAM_CEC7_1_OFFSET (0x7c) +#define GEN12_OAM_CEC0_0(base) \ + XE_REG((base) + GEN12_OAM_CEC0_0_OFFSET) +#define GEN12_OAM_CEC7_1(base) \ + XE_REG((base) + GEN12_OAM_CEC7_1_OFFSET) + +#define GEN12_OAM_STARTTRIG1_OFFSET (0x00) +#define GEN12_OAM_STARTTRIG8_OFFSET (0x1c) +#define GEN12_OAM_STARTTRIG1(base) \ + XE_REG((base) + GEN12_OAM_STARTTRIG1_OFFSET) +#define GEN12_OAM_STARTTRIG8(base) \ + XE_REG((base) + GEN12_OAM_STARTTRIG8_OFFSET) + +#define GEN12_OAM_REPORTTRIG1_OFFSET (0x20) +#define GEN12_OAM_REPORTTRIG8_OFFSET (0x3c) +#define GEN12_OAM_REPORTTRIG1(base) \ + XE_REG((base) + GEN12_OAM_REPORTTRIG1_OFFSET) +#define GEN12_OAM_REPORTTRIG8(base) \ + XE_REG((base) + GEN12_OAM_REPORTTRIG8_OFFSET) + +#define GEN12_OAM_PERF_COUNTER_B0_OFFSET (0x84) +#define GEN12_OAM_PERF_COUNTER_B(base, idx) \ + XE_REG((base) + GEN12_OAM_PERF_COUNTER_B0_OFFSET + 4 * (idx)) + +#endif /* __XE_OA_REGS__ */ From patchwork Tue Aug 8 01:21:48 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [04/11] drm/xe/oa: Module init/exit and probe/remove From: Ashutosh Dixit X-Patchwork-Id: 551690 Message-Id: <20230808012155.38531-5-ashutosh.dixit@intel.com> To: intel-gfx-trybot@lists.freedesktop.org Date: Mon, 7 Aug 2023 18:21:48 -0700 Perform OA initialization at module init and probe time: * Setup perf_stream_paranoid and oa_max_sample_rate files in /proc * Setup metrics sysfs directories to expose which metrics configurations are available * Setup OA groups which associate hw engines with 
OA units * Initialize OA units Signed-off-by: Ashutosh Dixit --- drivers/gpu/drm/xe/Makefile | 1 + drivers/gpu/drm/xe/xe_device.c | 11 + drivers/gpu/drm/xe/xe_device_types.h | 4 + drivers/gpu/drm/xe/xe_gt_types.h | 4 + drivers/gpu/drm/xe/xe_hw_engine_types.h | 2 + drivers/gpu/drm/xe/xe_module.c | 5 + drivers/gpu/drm/xe/xe_oa.c | 310 ++++++++++++++++++++++++ drivers/gpu/drm/xe/xe_oa.h | 18 ++ 8 files changed, 355 insertions(+) create mode 100644 drivers/gpu/drm/xe/xe_oa.c create mode 100644 drivers/gpu/drm/xe/xe_oa.h diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index 1b59702cd9f98..01280233ff271 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -84,6 +84,7 @@ xe-y += xe_bb.o \ xe_mmio.o \ xe_mocs.o \ xe_module.o \ + xe_oa.o \ xe_pat.o \ xe_pci.o \ xe_pcode.o \ diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 766df07de979c..1c54cac0a117f 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -25,6 +25,7 @@ #include "xe_irq.h" #include "xe_mmio.h" #include "xe_module.h" +#include "xe_oa.h" #include "xe_pcode.h" #include "xe_pm.h" #include "xe_query.h" @@ -323,6 +324,10 @@ int xe_device_probe(struct xe_device *xe) goto err_irq_shutdown; } + err = xe_oa_init(xe); + if (err) + goto err_irq_shutdown; + err = xe_display_init(xe); if (err) goto err_fini_display; @@ -333,6 +338,8 @@ int xe_device_probe(struct xe_device *xe) xe_display_register(xe); + xe_oa_register(xe); + xe_debugfs_register(xe); err = drmm_add_action_or_reset(&xe->drm, xe_device_sanitize, xe); @@ -361,10 +368,14 @@ static void xe_device_remove_display(struct xe_device *xe) void xe_device_remove(struct xe_device *xe) { + xe_oa_unregister(xe); + xe_device_remove_display(xe); xe_display_unlink(xe); + xe_oa_fini(xe); + xe_irq_shutdown(xe); } diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index f84ecb976f5d4..3b487905306b7 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -16,6 +16,7 @@ #include "xe_gt_types.h" #include "xe_platform_types.h" #include "xe_step_types.h" +#include "xe_oa.h" #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY) #include "ext/intel_device_info.h" @@ -376,6 +377,9 @@ struct xe_device { */ struct task_struct *pm_callback_task; + /** @oa: oa perf counter subsystem */ + struct xe_oa oa; + /* private: */ #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY) diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h index 35b8c19fa8bf5..d6053f85dbb60 100644 --- a/drivers/gpu/drm/xe/xe_gt_types.h +++ b/drivers/gpu/drm/xe/xe_gt_types.h @@ -13,6 +13,7 @@ #include "xe_reg_sr_types.h" #include "xe_sa_types.h" #include "xe_uc_types.h" +#include "xe_oa.h" struct xe_exec_queue_ops; struct xe_migrate; @@ -346,6 +347,9 @@ struct xe_gt { /** @oob: bitmap with active OOB workaroudns */ unsigned long *oob; } wa_active; + + /** @oa: oa perf counter subsystem per gt info */ + struct xe_oa_gt oa; }; #endif diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h index 97d9ba31b5fc7..92bb30433353c 100644 --- a/drivers/gpu/drm/xe/xe_hw_engine_types.h +++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h @@ -144,6 +144,8 @@ struct xe_hw_engine { enum xe_hw_engine_id engine_id; /** @eclass: pointer to per hw engine class interface */ struct xe_hw_engine_class_intf *eclass; + /** @oa_group: oa unit for this hw engine */ + struct xe_oa_group *oa_group; }; /** diff --git a/drivers/gpu/drm/xe/xe_module.c 
b/drivers/gpu/drm/xe/xe_module.c index de85494e2280b..460e8161c6f21 100644 --- a/drivers/gpu/drm/xe/xe_module.c +++ b/drivers/gpu/drm/xe/xe_module.c @@ -11,6 +11,7 @@ #include "xe_drv.h" #include "xe_hw_fence.h" #include "xe_module.h" +#include "xe_oa.h" #include "xe_pci.h" #include "xe_sched_job.h" @@ -53,6 +54,10 @@ static const struct init_funcs init_funcs[] = { .init = xe_register_pci_driver, .exit = xe_unregister_pci_driver, }, + { + .init = xe_oa_sysctl_register, + .exit = xe_oa_sysctl_unregister, + }, }; static int __init xe_init(void) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c new file mode 100644 index 0000000000000..d44ef611c76eb --- /dev/null +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2023 Intel Corporation + */ + +#include +#include +#include +#include + +#include +#include + +#include "regs/xe_oa_regs.h" +#include "xe_gt.h" +#include "xe_device.h" +#include "xe_oa.h" + +static u32 xe_oa_stream_paranoid = true; +static int xe_oa_sample_rate_hard_limit; +static u32 xe_oa_max_sample_rate = 100000; + +static const struct xe_oa_format oa_formats[] = { + [XE_OA_FORMAT_C4_B8] = { 7, 64 }, + [XE_OA_FORMAT_A12] = { 0, 64 }, + [XE_OA_FORMAT_A12_B8_C8] = { 2, 128 }, + [XE_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 }, + [XE_OAR_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 }, + [XE_OA_FORMAT_A24u40_A14u32_B8_C8] = { 5, 256 }, + [XE_OAM_FORMAT_MPEC8u64_B8_C8] = { 1, 192, TYPE_OAM, HDR_64_BIT }, + [XE_OAM_FORMAT_MPEC8u32_B8_C8] = { 2, 128, TYPE_OAM, HDR_64_BIT }, +}; + +static struct ctl_table_header *sysctl_header; + +void xe_oa_register(struct xe_device *xe) +{ + struct xe_oa *oa = &xe->oa; + + if (!oa->xe) + return; + + oa->metrics_kobj = kobject_create_and_add("metrics", + &xe->drm.primary->kdev->kobj); +} + +void xe_oa_unregister(struct xe_device *xe) +{ + struct xe_oa *oa = &xe->oa; + + if (!oa->metrics_kobj) + return; + + kobject_put(oa->metrics_kobj); + oa->metrics_kobj = NULL; +} + +static u32 num_oa_groups_per_gt(struct xe_gt *gt) +{ + return 1; +} + +static u32 __oam_engine_group(struct xe_hw_engine *hwe) +{ + if (GRAPHICS_VERx100(gt_to_xe(hwe->gt)) >= 1270) { + /* + * There's 1 SAMEDIA gt and 1 OAM per SAMEDIA gt. All media slices + * within the gt use the same OAM. All MTL SKUs list 1 SA MEDIA. 
+ */ + drm_WARN_ON(&hwe->gt->tile->xe->drm, + hwe->gt->info.type != XE_GT_TYPE_MEDIA); + + return OA_GROUP_OAM_SAMEDIA_0; + } + + return OA_GROUP_INVALID; +} + +static u32 __oa_engine_group(struct xe_hw_engine *hwe) +{ + switch (hwe->class) { + case XE_ENGINE_CLASS_RENDER: + return OA_GROUP_OAG; + + case XE_ENGINE_CLASS_VIDEO_DECODE: + case XE_ENGINE_CLASS_VIDEO_ENHANCE: + return __oam_engine_group(hwe); + + default: + return OA_GROUP_INVALID; + } +} + +static struct xe_oa_regs __oam_regs(u32 base) +{ + return (struct xe_oa_regs) { + base, + GEN12_OAM_HEAD_POINTER(base), + GEN12_OAM_TAIL_POINTER(base), + GEN12_OAM_BUFFER(base), + GEN12_OAM_CONTEXT_CONTROL(base), + GEN12_OAM_CONTROL(base), + GEN12_OAM_DEBUG(base), + GEN12_OAM_STATUS(base), + GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT, + }; +} + +static struct xe_oa_regs __oag_regs(void) +{ + return (struct xe_oa_regs) { + 0, + GEN12_OAG_OAHEADPTR, + GEN12_OAG_OATAILPTR, + GEN12_OAG_OABUFFER, + GEN12_OAG_OAGLBCTXCTRL, + GEN12_OAG_OACONTROL, + GEN12_OAG_OA_DEBUG, + GEN12_OAG_OASTATUS, + GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT, + }; +} + +static void xe_oa_init_groups(struct xe_gt *gt) +{ + const u32 mtl_oa_base[] = { + [OA_GROUP_OAM_SAMEDIA_0] = 0x393000, + }; + int i, num_groups = gt->oa.num_oa_groups; + + for (i = 0; i < num_groups; i++) { + struct xe_oa_group *g = >->oa.group[i]; + + /* Fused off engines can result in a group with num_engines == 0 */ + if (g->num_engines == 0) + continue; + + if (i == OA_GROUP_OAG && gt->info.type != XE_GT_TYPE_MEDIA) { + g->regs = __oag_regs(); + g->type = TYPE_OAG; + } else if (GRAPHICS_VERx100(gt_to_xe(gt)) >= 1270) { + g->regs = __oam_regs(mtl_oa_base[i]); + g->type = TYPE_OAM; + } + + /* Set oa_unit_ids now to ensure ids remain contiguous. */ + g->oa_unit_id = gt->tile->xe->oa.oa_unit_ids++; + } +} + +static int xe_oa_init_gt(struct xe_gt *gt) +{ + u32 num_groups = num_oa_groups_per_gt(gt); + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + struct xe_oa_group *g; + + g = kcalloc(num_groups, sizeof(*g), GFP_KERNEL); + if (!g) + return -ENOMEM; + + for_each_hw_engine(hwe, gt, id) { + u32 index = __oa_engine_group(hwe); + + hwe->oa_group = NULL; + if (index < num_groups) { + g[index].num_engines++; + hwe->oa_group = &g[index]; + } + } + + gt->oa.num_oa_groups = num_groups; + gt->oa.group = g; + + xe_oa_init_groups(gt); + + return 0; +} + +static int xe_oa_init_engine_groups(struct xe_oa *oa) +{ + struct xe_gt *gt; + int i, ret; + + for_each_gt(gt, oa->xe, i) { + ret = xe_oa_init_gt(gt); + if (ret) + return ret; + } + + return 0; +} + +static void oa_format_add(struct xe_oa *oa, enum drm_xe_oa_format format) +{ + __set_bit(format, oa->format_mask); +} + +static void xe_oa_init_supported_formats(struct xe_oa *oa) +{ + switch (oa->xe->info.platform) { + case XE_ALDERLAKE_S: + case XE_ALDERLAKE_P: + oa_format_add(oa, XE_OA_FORMAT_A12); + oa_format_add(oa, XE_OA_FORMAT_A12_B8_C8); + oa_format_add(oa, XE_OA_FORMAT_A32u40_A4u32_B8_C8); + oa_format_add(oa, XE_OA_FORMAT_C4_B8); + break; + + case XE_DG2: + oa_format_add(oa, XE_OAR_FORMAT_A32u40_A4u32_B8_C8); + oa_format_add(oa, XE_OA_FORMAT_A24u40_A14u32_B8_C8); + break; + + case XE_METEORLAKE: + oa_format_add(oa, XE_OAR_FORMAT_A32u40_A4u32_B8_C8); + oa_format_add(oa, XE_OA_FORMAT_A24u40_A14u32_B8_C8); + oa_format_add(oa, XE_OAM_FORMAT_MPEC8u64_B8_C8); + oa_format_add(oa, XE_OAM_FORMAT_MPEC8u32_B8_C8); + break; + + default: + drm_err(&oa->xe->drm, "Unknown platform\n"); + } +} + +int xe_oa_init(struct xe_device *xe) +{ + struct xe_oa *oa = &xe->oa; + struct 
xe_gt *gt; + int i, ret; + + /* Support OA only with GuC submission and Gen12+ */ + if (XE_WARN_ON(!xe_device_guc_submission_enabled(xe)) || + XE_WARN_ON(GRAPHICS_VER(xe) < 12)) + return 0; + + oa->xe = xe; + oa->oa_formats = oa_formats; + + for_each_gt(gt, xe, i) + mutex_init(>->oa.lock); + + /* Choose a representative limit */ + xe_oa_sample_rate_hard_limit = xe_root_mmio_gt(xe)->info.clock_freq / 2; + + mutex_init(&oa->metrics_lock); + idr_init_base(&oa->metrics_idr, 1); + + ret = xe_oa_init_engine_groups(oa); + if (ret) { + drm_err(&xe->drm, "OA initialization failed %d\n", ret); + return ret; + } + + xe_oa_init_supported_formats(oa); + + oa->xe = xe; + return 0; +} + +void xe_oa_fini(struct xe_device *xe) +{ + struct xe_oa *oa = &xe->oa; + struct xe_gt *gt; + int i; + + if (!oa->xe) + return; + + for_each_gt(gt, xe, i) + kfree(gt->oa.group); + + idr_destroy(&oa->metrics_idr); + + oa->xe = NULL; +} + +static struct ctl_table oa_ctl_table[] = { + { + .procname = "perf_stream_paranoid", + .data = &xe_oa_stream_paranoid, + .maxlen = sizeof(xe_oa_stream_paranoid), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "oa_max_sample_rate", + .data = &xe_oa_max_sample_rate, + .maxlen = sizeof(xe_oa_max_sample_rate), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &xe_oa_sample_rate_hard_limit, + }, + {} +}; + +int xe_oa_sysctl_register(void) +{ + sysctl_header = register_sysctl("dev/xe", oa_ctl_table); + return 0; +} + +void xe_oa_sysctl_unregister(void) +{ + unregister_sysctl_table(sysctl_header); +} diff --git a/drivers/gpu/drm/xe/xe_oa.h b/drivers/gpu/drm/xe/xe_oa.h new file mode 100644 index 0000000000000..ba4ba80fd34cb --- /dev/null +++ b/drivers/gpu/drm/xe/xe_oa.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023 Intel Corporation + */ + +#ifndef _XE_OA_H_ +#define _XE_OA_H_ + +#include "xe_oa_types.h" + +int xe_oa_init(struct xe_device *xe); +void xe_oa_fini(struct xe_device *xe); +void xe_oa_register(struct xe_device *xe); +void xe_oa_unregister(struct xe_device *xe); +int xe_oa_sysctl_register(void); +void xe_oa_sysctl_unregister(void); + +#endif From patchwork Tue Aug 8 01:21:49 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [05/11] drm/xe/oa: Add/remove config ioctl's From: Ashutosh Dixit X-Patchwork-Id: 551689 Message-Id: <20230808012155.38531-6-ashutosh.dixit@intel.com> To: intel-gfx-trybot@lists.freedesktop.org Date: Mon, 7 Aug 2023 18:21:49 -0700 OA configurations consist of a set of event and counter select registers. The add_config ioctl validates and stores such configurations and also exposes them in the metrics sysfs. These configurations will be programmed to OA unit HW when an OA stream using a configuration is opened. The OA stream can also switch to other stored configurations. 
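A minimal userspace sketch of the add/remove flow (not part of the patch; the single mux register/value pair is a placeholder for illustration, real configurations come from metrics generation tooling, and drm_fd is an already-open xe device fd):

static int add_then_remove_config(int drm_fd)
{
        /* one (address, value) pair; 0x9888 (NOA_WRITE) is in the mux allowlist */
        static __u32 mux_regs[] = { 0x9888, 0x0 };
        struct drm_xe_oa_config cfg = {
                .uuid = "01234567-0123-0123-0123-0123456789ab",
                .n_mux_regs = 1,
                .mux_regs_ptr = (uintptr_t)mux_regs,
        };
        __u64 id;
        int ret;

        ret = ioctl(drm_fd, DRM_IOCTL_XE_OA_ADD_CONFIG, &cfg);
        if (ret < 0)
                return ret;

        id = ret;       /* the new config id, usable as DRM_XE_OA_PROP_OA_METRICS_SET */
        return ioctl(drm_fd, DRM_IOCTL_XE_OA_REMOVE_CONFIG, &id);
}

Each added configuration is also exposed under the "metrics" sysfs directory as <uuid>/id, so tools can map a uuid to the id used at stream open.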
Signed-off-by: Ashutosh Dixit --- drivers/gpu/drm/xe/xe_device.c | 4 + drivers/gpu/drm/xe/xe_oa.c | 379 ++++++++++++++++++++++++++++++++- drivers/gpu/drm/xe/xe_oa.h | 5 + 3 files changed, 387 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 1c54cac0a117f..3772ce1dd234b 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -114,6 +114,10 @@ static const struct drm_ioctl_desc xe_ioctls[] = { DRM_IOCTL_DEF_DRV(XE_WAIT_USER_FENCE, xe_wait_user_fence_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(XE_VM_MADVISE, xe_vm_madvise_ioctl, DRM_RENDER_ALLOW), + + DRM_IOCTL_DEF_DRV(XE_OA_ADD_CONFIG, xe_oa_add_config_ioctl, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(XE_OA_REMOVE_CONFIG, xe_oa_remove_config_ioctl, DRM_RENDER_ALLOW), + }; static const struct file_operations xe_driver_fops = { diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index d44ef611c76eb..b98eeab8573d7 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -12,8 +12,8 @@ #include #include "regs/xe_oa_regs.h" -#include "xe_gt.h" #include "xe_device.h" +#include "xe_gt.h" #include "xe_oa.h" static u32 xe_oa_stream_paranoid = true; @@ -33,6 +33,376 @@ static const struct xe_oa_format oa_formats[] = { static struct ctl_table_header *sysctl_header; +static void xe_oa_config_release(struct kref *ref) +{ + struct xe_oa_config *oa_config = + container_of(ref, typeof(*oa_config), ref); + + kfree(oa_config->flex_regs); + kfree(oa_config->b_counter_regs); + kfree(oa_config->mux_regs); + + kfree_rcu(oa_config, rcu); +} + +static void xe_oa_config_put(struct xe_oa_config *oa_config) +{ + if (!oa_config) + return; + + kref_put(&oa_config->ref, xe_oa_config_release); +} + +static bool xe_oa_is_valid_flex_addr(struct xe_oa *oa, u32 addr) +{ + static const struct xe_reg flex_eu_regs[] = { + EU_PERF_CNTL0, + EU_PERF_CNTL1, + EU_PERF_CNTL2, + EU_PERF_CNTL3, + EU_PERF_CNTL4, + EU_PERF_CNTL5, + EU_PERF_CNTL6, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(flex_eu_regs); i++) { + if (flex_eu_regs[i].addr == addr) + return true; + } + return false; +} + +static bool xe_oa_reg_in_range_table(u32 addr, const struct xe_mmio_range *table) +{ + while (table->start || table->end) { + if (addr >= table->start && addr <= table->end) + return true; + + table++; + } + + return false; +} + +static const struct xe_mmio_range xehp_oa_b_counters[] = { + { .start = 0xdc48, .end = 0xdc48 }, /* OAA_ENABLE_REG */ + { .start = 0xdd00, .end = 0xdd48 }, /* OAG_LCE0_0 - OAA_LENABLE_REG */ + {} +}; + +static const struct xe_mmio_range gen12_oa_b_counters[] = { + { .start = 0x2b2c, .end = 0x2b2c }, /* GEN12_OAG_OA_PESS */ + { .start = 0xd900, .end = 0xd91c }, /* GEN12_OAG_OASTARTTRIG[1-8] */ + { .start = 0xd920, .end = 0xd93c }, /* GEN12_OAG_OAREPORTTRIG1[1-8] */ + { .start = 0xd940, .end = 0xd97c }, /* GEN12_OAG_CEC[0-7][0-1] */ + { .start = 0xdc00, .end = 0xdc3c }, /* GEN12_OAG_SCEC[0-7][0-1] */ + { .start = 0xdc40, .end = 0xdc40 }, /* GEN12_OAG_SPCTR_CNF */ + { .start = 0xdc44, .end = 0xdc44 }, /* GEN12_OAA_DBG_REG */ + {} +}; + +static const struct xe_mmio_range mtl_oam_b_counters[] = { + { .start = 0x393000, .end = 0x39301c }, /* GEN12_OAM_STARTTRIG1[1-8] */ + { .start = 0x393020, .end = 0x39303c }, /* GEN12_OAM_REPORTTRIG1[1-8] */ + { .start = 0x393040, .end = 0x39307c }, /* GEN12_OAM_CEC[0-7][0-1] */ + { .start = 0x393200, .end = 0x39323C }, /* MPES[0-7] */ + {} +}; + +static bool xe_oa_is_valid_b_counter_addr(struct xe_oa *oa, u32 addr) +{ + return 
xe_oa_reg_in_range_table(addr, xehp_oa_b_counters) || + xe_oa_reg_in_range_table(addr, gen12_oa_b_counters) || + xe_oa_reg_in_range_table(addr, mtl_oam_b_counters); +} + +/* + * Ref: 14010536224: + * 0x20cc is repurposed on MTL, so use a separate array for MTL. + */ +static const struct xe_mmio_range mtl_oa_mux_regs[] = { + { .start = 0x0d00, .end = 0x0d04 }, /* RPM_CONFIG[0-1] */ + { .start = 0x0d0c, .end = 0x0d2c }, /* NOA_CONFIG[0-8] */ + { .start = 0x9840, .end = 0x9840 }, /* GDT_CHICKEN_BITS */ + { .start = 0x9884, .end = 0x9888 }, /* NOA_WRITE */ + { .start = 0x38d100, .end = 0x38d114}, /* VISACTL */ + {} +}; + +static const struct xe_mmio_range gen12_oa_mux_regs[] = { + { .start = 0x0d00, .end = 0x0d04 }, /* RPM_CONFIG[0-1] */ + { .start = 0x0d0c, .end = 0x0d2c }, /* NOA_CONFIG[0-8] */ + { .start = 0x9840, .end = 0x9840 }, /* GDT_CHICKEN_BITS */ + { .start = 0x9884, .end = 0x9888 }, /* NOA_WRITE */ + { .start = 0x20cc, .end = 0x20cc }, /* WAIT_FOR_RC6_EXIT */ + {} +}; + +static bool xe_oa_is_valid_mux_addr(struct xe_oa *oa, u32 addr) +{ + if (oa->xe->info.platform == XE_METEORLAKE) + return xe_oa_reg_in_range_table(addr, mtl_oa_mux_regs); + else + return xe_oa_reg_in_range_table(addr, gen12_oa_mux_regs); +} + +static u32 mask_reg_value(u32 reg, u32 val) +{ + /* + * HALF_SLICE_CHICKEN2 is programmed with a the WaDisableSTUnitPowerOptimization + * workaround. Make sure the value programmed by userspace doesn't change this. + */ + if (REG_EQUAL_MCR(reg, HALF_SLICE_CHICKEN2)) + val = val & ~_MASKED_BIT_ENABLE(GEN8_ST_PO_DISABLE); + + /* + * WAIT_FOR_RC6_EXIT has only one bit fullfilling the function indicated by its + * name and a bunch of selection fields used by OA configs. + */ + if (REG_EQUAL(reg, WAIT_FOR_RC6_EXIT)) + val = val & ~_MASKED_BIT_ENABLE(HSW_WAIT_FOR_RC6_EXIT_ENABLE); + + return val; +} + +static struct xe_oa_reg * +xe_oa_alloc_regs(struct xe_oa *oa, bool (*is_valid)(struct xe_oa *oa, u32 addr), + u32 __user *regs, u32 n_regs) +{ + struct xe_oa_reg *oa_regs; + int err; + u32 i; + + if (!n_regs || WARN_ON(!is_valid)) + return NULL; + + oa_regs = kmalloc_array(n_regs, sizeof(*oa_regs), GFP_KERNEL); + if (!oa_regs) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < n_regs; i++) { + u32 addr, value; + + err = get_user(addr, regs); + if (err) + goto addr_err; + + if (!is_valid(oa, addr)) { + drm_dbg(&oa->xe->drm, "Invalid oa_reg address: %X\n", addr); + err = -EINVAL; + goto addr_err; + } + + err = get_user(value, regs + 1); + if (err) + goto addr_err; + + oa_regs[i].addr = XE_REG(addr); + oa_regs[i].value = mask_reg_value(addr, value); + + regs += 2; + } + + return oa_regs; + +addr_err: + kfree(oa_regs); + return ERR_PTR(err); +} + +static ssize_t show_dynamic_id(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct xe_oa_config *oa_config = + container_of(attr, typeof(*oa_config), sysfs_metric_id); + + return sprintf(buf, "%d\n", oa_config->id); +} + +static int create_dynamic_oa_sysfs_entry(struct xe_oa *oa, + struct xe_oa_config *oa_config) +{ + sysfs_attr_init(&oa_config->sysfs_metric_id.attr); + oa_config->sysfs_metric_id.attr.name = "id"; + oa_config->sysfs_metric_id.attr.mode = 0444; + oa_config->sysfs_metric_id.show = show_dynamic_id; + oa_config->sysfs_metric_id.store = NULL; + + oa_config->attrs[0] = &oa_config->sysfs_metric_id.attr; + oa_config->attrs[1] = NULL; + + oa_config->sysfs_metric.name = oa_config->uuid; + oa_config->sysfs_metric.attrs = oa_config->attrs; + + return sysfs_create_group(oa->metrics_kobj, &oa_config->sysfs_metric); +} + 
+int xe_oa_add_config_ioctl(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct xe_oa *oa = &to_xe_device(dev)->oa; + struct drm_xe_oa_config *args = data; + struct xe_oa_config *oa_config, *tmp; + struct xe_oa_reg *regs; + int err, id; + + if (!oa->xe) { + drm_dbg(&oa->xe->drm, "xe oa interface not available for this system\n"); + return -ENODEV; + } + + if (xe_oa_stream_paranoid && !perfmon_capable()) { + drm_dbg(&oa->xe->drm, "Insufficient privileges to add xe OA config\n"); + return -EACCES; + } + + if ((!args->mux_regs_ptr || !args->n_mux_regs) && + (!args->boolean_regs_ptr || !args->n_boolean_regs) && + (!args->flex_regs_ptr || !args->n_flex_regs)) { + drm_dbg(&oa->xe->drm, "No OA registers given\n"); + return -EINVAL; + } + + oa_config = kzalloc(sizeof(*oa_config), GFP_KERNEL); + if (!oa_config) + return -ENOMEM; + + oa_config->oa = oa; + kref_init(&oa_config->ref); + + if (!uuid_is_valid(args->uuid)) { + drm_dbg(&oa->xe->drm, "Invalid uuid format for OA config\n"); + err = -EINVAL; + goto reg_err; + } + + /* Last character in oa_config->uuid will be 0 because oa_config is kzalloc */ + memcpy(oa_config->uuid, args->uuid, sizeof(args->uuid)); + + oa_config->mux_regs_len = args->n_mux_regs; + regs = xe_oa_alloc_regs(oa, xe_oa_is_valid_mux_addr, + u64_to_user_ptr(args->mux_regs_ptr), + args->n_mux_regs); + if (IS_ERR(regs)) { + drm_dbg(&oa->xe->drm, "Failed to create OA config for mux_regs\n"); + err = PTR_ERR(regs); + goto reg_err; + } + oa_config->mux_regs = regs; + + oa_config->b_counter_regs_len = args->n_boolean_regs; + regs = xe_oa_alloc_regs(oa, xe_oa_is_valid_b_counter_addr, + u64_to_user_ptr(args->boolean_regs_ptr), + args->n_boolean_regs); + if (IS_ERR(regs)) { + drm_dbg(&oa->xe->drm, "Failed to create OA config for b_counter_regs\n"); + err = PTR_ERR(regs); + goto reg_err; + } + oa_config->b_counter_regs = regs; + + oa_config->flex_regs_len = args->n_flex_regs; + regs = xe_oa_alloc_regs(oa, xe_oa_is_valid_flex_addr, + u64_to_user_ptr(args->flex_regs_ptr), + args->n_flex_regs); + if (IS_ERR(regs)) { + drm_dbg(&oa->xe->drm, "Failed to create OA config for flex_regs\n"); + err = PTR_ERR(regs); + goto reg_err; + } + oa_config->flex_regs = regs; + + err = mutex_lock_interruptible(&oa->metrics_lock); + if (err) + goto reg_err; + + /* We shouldn't have too many configs, so this iteration shouldn't be too costly */ + idr_for_each_entry(&oa->metrics_idr, tmp, id) { + if (!strcmp(tmp->uuid, oa_config->uuid)) { + drm_dbg(&oa->xe->drm, "OA config already exists with this uuid\n"); + err = -EADDRINUSE; + goto sysfs_err; + } + } + + err = create_dynamic_oa_sysfs_entry(oa, oa_config); + if (err) { + drm_dbg(&oa->xe->drm, "Failed to create sysfs entry for OA config\n"); + goto sysfs_err; + } + + /* Config id 0 is invalid, id 1 for kernel stored test config. 
*/ + oa_config->id = idr_alloc(&oa->metrics_idr, oa_config, 2, 0, GFP_KERNEL); + if (oa_config->id < 0) { + drm_dbg(&oa->xe->drm, "Failed to create sysfs entry for OA config\n"); + err = oa_config->id; + goto sysfs_err; + } + + mutex_unlock(&oa->metrics_lock); + + drm_dbg(&oa->xe->drm, "Added config %s id=%i\n", oa_config->uuid, oa_config->id); + + return oa_config->id; + +sysfs_err: + mutex_unlock(&oa->metrics_lock); +reg_err: + xe_oa_config_put(oa_config); + drm_dbg(&oa->xe->drm, "Failed to add new OA config\n"); + return err; +} + +int xe_oa_remove_config_ioctl(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct xe_oa *oa = &to_xe_device(dev)->oa; + struct xe_oa_config *oa_config; + u64 *arg = data; + int ret; + + if (!oa->xe) { + drm_dbg(&oa->xe->drm, "xe oa interface not available for this system\n"); + return -ENODEV; + } + + if (xe_oa_stream_paranoid && !perfmon_capable()) { + drm_dbg(&oa->xe->drm, "Insufficient privileges to remove xe OA config\n"); + return -EACCES; + } + + ret = mutex_lock_interruptible(&oa->metrics_lock); + if (ret) + return ret; + + oa_config = idr_find(&oa->metrics_idr, *arg); + if (!oa_config) { + drm_dbg(&oa->xe->drm, "Failed to remove unknown OA config\n"); + ret = -ENOENT; + goto err_unlock; + } + + WARN_ON(*arg != oa_config->id); + + sysfs_remove_group(oa->metrics_kobj, &oa_config->sysfs_metric); + + idr_remove(&oa->metrics_idr, *arg); + + mutex_unlock(&oa->metrics_lock); + + drm_dbg(&oa->xe->drm, "Removed config %s id=%i\n", oa_config->uuid, oa_config->id); + + xe_oa_config_put(oa_config); + + return 0; + +err_unlock: + mutex_unlock(&oa->metrics_lock); + return ret; +} + void xe_oa_register(struct xe_device *xe) { struct xe_oa *oa = &xe->oa; @@ -259,6 +629,12 @@ int xe_oa_init(struct xe_device *xe) return 0; } +static int destroy_config(int id, void *p, void *data) +{ + xe_oa_config_put(p); + return 0; +} + void xe_oa_fini(struct xe_device *xe) { struct xe_oa *oa = &xe->oa; @@ -271,6 +647,7 @@ void xe_oa_fini(struct xe_device *xe) for_each_gt(gt, xe, i) kfree(gt->oa.group); + idr_for_each(&oa->metrics_idr, destroy_config, oa); idr_destroy(&oa->metrics_idr); oa->xe = NULL; diff --git a/drivers/gpu/drm/xe/xe_oa.h b/drivers/gpu/drm/xe/xe_oa.h index ba4ba80fd34cb..79f77f445deb0 100644 --- a/drivers/gpu/drm/xe/xe_oa.h +++ b/drivers/gpu/drm/xe/xe_oa.h @@ -12,7 +12,12 @@ int xe_oa_init(struct xe_device *xe); void xe_oa_fini(struct xe_device *xe); void xe_oa_register(struct xe_device *xe); void xe_oa_unregister(struct xe_device *xe); +int xe_oa_ioctl_version(struct xe_device *xe); int xe_oa_sysctl_register(void); void xe_oa_sysctl_unregister(void); +int xe_oa_add_config_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); +int xe_oa_remove_config_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); #endif From patchwork Tue Aug 8 01:21:50 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [06/11] drm/xe/oa: Start implementing OA stream open ioctl From: Ashutosh Dixit X-Patchwork-Id: 551680 Message-Id: <20230808012155.38531-7-ashutosh.dixit@intel.com> To: intel-gfx-trybot@lists.freedesktop.org Date: Mon, 7 Aug 2023 18:21:50 -0700 Start implementing OA stream open ioctl and parse properties passed in as part of OA stream open. The remaining operations associated with OA stream open continue in subsequent patches. 
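One of the properties parsed here, DRM_XE_OA_PROP_OA_EXPONENT, selects the periodic sampling rate. A sketch of the conversion this patch performs in oa_exponent_to_ns(), with an assumed 19.2 MHz OA timestamp frequency purely for illustration:

/* period_ns = ceil((2 << exponent) * NSEC_PER_SEC / oa_timestamp_freq) */
static unsigned long long oa_period_ns(int exponent, unsigned long long freq_hz)
{
        unsigned long long nom = (2ULL << exponent) * 1000000000ULL;

        return (nom + freq_hz - 1) / freq_hz;
}

/*
 * With freq_hz = 19200000 (assumed), exponent 16 gives ~6.83 ms (~146 Hz).
 * The resulting rate must stay below dev.xe.oa_max_sample_rate unless the
 * caller has CAP_PERFMON or CAP_SYS_ADMIN.
 */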
Signed-off-by: Ashutosh Dixit --- drivers/gpu/drm/xe/xe_device.c | 1 + drivers/gpu/drm/xe/xe_oa.c | 240 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/xe/xe_oa.h | 2 + 3 files changed, 243 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 3772ce1dd234b..37fb9e6772e40 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -115,6 +115,7 @@ static const struct drm_ioctl_desc xe_ioctls[] = { DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(XE_VM_MADVISE, xe_vm_madvise_ioctl, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(XE_OA_OPEN, xe_oa_stream_open_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(XE_OA_ADD_CONFIG, xe_oa_add_config_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(XE_OA_REMOVE_CONFIG, xe_oa_remove_config_ioctl, DRM_RENDER_ALLOW), diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index b98eeab8573d7..0ea5af549bdaa 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -11,11 +11,16 @@ #include #include +#include "regs/xe_gt_regs.h" #include "regs/xe_oa_regs.h" #include "xe_device.h" #include "xe_gt.h" +#include "xe_mmio.h" #include "xe_oa.h" +#define DEFAULT_POLL_FREQUENCY_HZ 200 +#define DEFAULT_POLL_PERIOD_NS (NSEC_PER_SEC / DEFAULT_POLL_FREQUENCY_HZ) + static u32 xe_oa_stream_paranoid = true; static int xe_oa_sample_rate_hard_limit; static u32 xe_oa_max_sample_rate = 100000; @@ -31,6 +36,21 @@ static const struct xe_oa_format oa_formats[] = { [XE_OAM_FORMAT_MPEC8u32_B8_C8] = { 2, 128, TYPE_OAM, HDR_64_BIT }, }; +struct xe_oa_open_properties { + bool sample; + bool single_exec_q; + u64 exec_q_id; + + int metrics_set; + int oa_format; + bool oa_periodic; + int oa_period_exponent; + + struct xe_hw_engine *hwe; + + u64 poll_oa_period; +}; + static struct ctl_table_header *sysctl_header; static void xe_oa_config_release(struct kref *ref) @@ -53,6 +73,226 @@ static void xe_oa_config_put(struct xe_oa_config *oa_config) kref_put(&oa_config->ref, xe_oa_config_release); } +/* + * OA timestamp frequency = CS timestamp frequency in most platforms. On some + * platforms OA unit ignores the CTC_SHIFT and the 2 timestamps differ. In such + * cases, return the adjusted CS timestamp frequency to the user. 
+ */ +u32 xe_oa_timestamp_frequency(struct xe_device *xe) +{ + struct xe_gt *gt = xe_root_mmio_gt(xe); + u32 reg, shift; + + /* + * Wa_18013179988:dg2 + * Wa_14015846243:mtl + */ + switch (xe->info.platform) { + case XE_DG2: + case XE_METEORLAKE: + xe_device_mem_access_get(xe); + XE_WARN_ON(xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + reg = xe_mmio_read32(xe_root_mmio_gt(xe), RPM_CONFIG0); + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + xe_device_mem_access_put(xe); + + shift = REG_FIELD_GET(RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK, reg); + return xe_root_mmio_gt(xe)->info.clock_freq << (3 - shift); + + default: + return xe_root_mmio_gt(xe)->info.clock_freq; + } +} + +static u64 oa_exponent_to_ns(struct xe_oa *oa, int exponent) +{ + u64 nom = (2ULL << exponent) * NSEC_PER_SEC; + u32 den = xe_oa_timestamp_frequency(oa->xe); + + return div_u64(nom + den - 1, den); +} + +static bool oa_format_valid(struct xe_oa *oa, u64 format) +{ + if (format >= XE_OA_FORMAT_MAX) + return false; + return test_bit(format, oa->format_mask); +} + +static bool engine_supports_oa(const struct xe_hw_engine *hwe) +{ + return hwe->oa_group; +} + +static bool engine_supports_oa_format(const struct xe_hw_engine *hwe, int type) +{ + return hwe->oa_group && hwe->oa_group->type == type; +} + +#define OA_EXPONENT_MAX 31 + +static int xe_oa_read_properties_unlocked(struct xe_oa *oa, u64 __user *uprops, + u32 n_props, + struct xe_oa_open_properties *props) +{ + const struct xe_oa_format *f; + u64 __user *uprop = uprops; + bool config_instance = false; + bool config_class = false; + u8 class, instance; + struct xe_gt *gt; + u32 i; + int ret; + + if (!n_props || n_props >= DRM_XE_OA_PROP_MAX) { + drm_dbg(&oa->xe->drm, "Invalid number of xe perf properties given\n"); + return -EINVAL; + } + + props->poll_oa_period = DEFAULT_POLL_PERIOD_NS; + + /* Defaults when class:instance is not passed */ + class = XE_ENGINE_CLASS_RENDER; + instance = 0; + + for (i = 0; i < n_props; i++) { + u64 oa_period, oa_freq_hz; + u64 id, value; + + ret = get_user(id, uprop); + if (ret) + return ret; + + ret = get_user(value, uprop + 1); + if (ret) + return ret; + + switch ((enum drm_xe_oa_property_id)id) { + case DRM_XE_OA_PROP_EXEC_QUEUE_ID: + props->single_exec_q = true; + props->exec_q_id = value; + break; + case DRM_XE_OA_PROP_SAMPLE_OA: + props->sample = value; + break; + case DRM_XE_OA_PROP_OA_METRICS_SET: + if (!value) { + drm_dbg(&oa->xe->drm, "Unknown OA metric set ID\n"); + return -EINVAL; + } + props->metrics_set = value; + break; + case DRM_XE_OA_PROP_OA_FORMAT: + if (!oa_format_valid(oa, value)) { + drm_dbg(&oa->xe->drm, "Unsupported OA report format %llu\n", + value); + return -EINVAL; + } + props->oa_format = value; + break; + case DRM_XE_OA_PROP_OA_EXPONENT: + if (value > OA_EXPONENT_MAX) { + drm_dbg(&oa->xe->drm, "OA timer exponent too high (> %u)\n", + OA_EXPONENT_MAX); + return -EINVAL; + } + + BUILD_BUG_ON(sizeof(oa_period) != 8); + oa_period = oa_exponent_to_ns(oa, value); + + oa_freq_hz = div64_u64(NSEC_PER_SEC, oa_period); + if (oa_freq_hz > xe_oa_max_sample_rate && !perfmon_capable()) { + drm_dbg(&oa->xe->drm, + "OA exponent would exceed the max sampling frequency (sysctl dev.xe.oa_max_sample_rate) %uHz without CAP_PERFMON or CAP_SYS_ADMIN privileges\n", + xe_oa_max_sample_rate); + return -EACCES; + } + + props->oa_periodic = true; + props->oa_period_exponent = value; + break; + case DRM_XE_OA_PROP_POLL_OA_PERIOD: + if (value < 100000 /* 100us */) { + drm_dbg(&oa->xe->drm, "OA timer too small (%lluns < 
100us)\n", + value); + return -EINVAL; + } + props->poll_oa_period = value; + break; + case DRM_XE_OA_PROP_OA_ENGINE_CLASS: + class = (u8)value; + config_class = true; + break; + case DRM_XE_OA_PROP_OA_ENGINE_INSTANCE: + instance = (u8)value; + config_instance = true; + break; + default: + drm_dbg(&oa->xe->drm, "Unknown xe oa property ID\n"); + return -EINVAL; + } + + uprop += 2; + } + + if ((config_class && !config_instance) || + (config_instance && !config_class)) { + drm_dbg(&oa->xe->drm, "OA engine class/instance parameters must be passed together\n"); + return -EINVAL; + } + + for_each_gt(gt, oa->xe, i) { + props->hwe = xe_gt_hw_engine(gt, class, instance, false); + if (props->hwe) + break; + } + if (!props->hwe) { + drm_dbg(&oa->xe->drm, "OA engine class and instance invalid %d:%d\n", + class, instance); + return -EINVAL; + } + + if (!engine_supports_oa(props->hwe)) { + drm_dbg(&oa->xe->drm, "Engine not supported by OA %d:%d\n", + class, instance); + return -EINVAL; + } + + f = &oa->oa_formats[props->oa_format]; + if (!props->oa_format || !f->size || + !engine_supports_oa_format(props->hwe, f->type)) { + drm_dbg(&oa->xe->drm, "Invalid OA format %d type %d size %d for class %d\n", + props->oa_format, f->type, f->size, props->hwe->class); + return -EINVAL; + } + + return 0; +} + +int xe_oa_stream_open_ioctl(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct xe_oa *oa = &to_xe_device(dev)->oa; + struct drm_xe_oa_open_param *param = data; + struct xe_oa_open_properties props = {}; + u32 known_open_flags; + + if (!oa->xe) { + drm_dbg(&oa->xe->drm, "xe oa interface not available for this system\n"); + return -ENODEV; + } + + known_open_flags = XE_OA_FLAG_FD_CLOEXEC | XE_OA_FLAG_FD_NONBLOCK | XE_OA_FLAG_DISABLED; + if (param->flags & ~known_open_flags) { + drm_dbg(&oa->xe->drm, "Unknown drm_xe_oa_open_param flag\n"); + return -EINVAL; + } + + return xe_oa_read_properties_unlocked(oa, u64_to_user_ptr(param->properties_ptr), + param->num_properties, + &props); +} + static bool xe_oa_is_valid_flex_addr(struct xe_oa *oa, u32 addr) { static const struct xe_reg flex_eu_regs[] = { diff --git a/drivers/gpu/drm/xe/xe_oa.h b/drivers/gpu/drm/xe/xe_oa.h index 79f77f445deb0..fd6caf652047a 100644 --- a/drivers/gpu/drm/xe/xe_oa.h +++ b/drivers/gpu/drm/xe/xe_oa.h @@ -16,6 +16,8 @@ int xe_oa_ioctl_version(struct xe_device *xe); int xe_oa_sysctl_register(void); void xe_oa_sysctl_unregister(void); +int xe_oa_stream_open_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); int xe_oa_add_config_ioctl(struct drm_device *dev, void *data, struct drm_file *file); int xe_oa_remove_config_ioctl(struct drm_device *dev, void *data, From patchwork Tue Aug 8 01:21:51 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [07/11] drm/xe/oa: OA stream initialization From: Ashutosh Dixit X-Patchwork-Id: 551686 Message-Id: <20230808012155.38531-8-ashutosh.dixit@intel.com> To: intel-gfx-trybot@lists.freedesktop.org Date: Mon, 7 Aug 2023 18:21:51 -0700 Implement majority of OA stream initialization (as part of OA stream open ioctl). The OA buffer is allocated for receiving perf counter samples from HW. The selected counter configuration is programmed into OA unit HW using a command/batch buffer. For OAR, the render context image is modified so as to have correct register values when the context switches in. 
Signed-off-by: Ashutosh Dixit --- drivers/gpu/drm/xe/xe_oa.c | 678 ++++++++++++++++++++++++++++++++++++- 1 file changed, 675 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index 0ea5af549bdaa..bb34bd604c2c1 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -11,13 +11,26 @@ #include #include +#include "regs/xe_engine_regs.h" #include "regs/xe_gt_regs.h" +#include "regs/xe_lrc_layout.h" #include "regs/xe_oa_regs.h" +#include "regs/xe_regs.h" +#include "xe_bb.h" +#include "xe_bo.h" #include "xe_device.h" +#include "xe_exec_queue.h" #include "xe_gt.h" +#include "xe_gt_mcr.h" +#include "xe_lrc.h" +#include "xe_migrate.h" #include "xe_mmio.h" #include "xe_oa.h" +#include "xe_sched_job.h" +#include "xe_vm.h" +#define OA_BUFFER_SIZE SZ_16M +#define OA_TAKEN(tail, head) (((tail) - (head)) & (OA_BUFFER_SIZE - 1)) #define DEFAULT_POLL_FREQUENCY_HZ 200 #define DEFAULT_POLL_PERIOD_NS (NSEC_PER_SEC / DEFAULT_POLL_FREQUENCY_HZ) @@ -25,6 +38,12 @@ static u32 xe_oa_stream_paranoid = true; static int xe_oa_sample_rate_hard_limit; static u32 xe_oa_max_sample_rate = 100000; +struct flex { + struct xe_reg reg; + u32 offset; + u32 value; +}; + static const struct xe_oa_format oa_formats[] = { [XE_OA_FORMAT_C4_B8] = { 7, 64 }, [XE_OA_FORMAT_A12] = { 0, 64 }, @@ -51,6 +70,13 @@ struct xe_oa_open_properties { u64 poll_oa_period; }; +struct xe_oa_config_bo { + struct llist_node node; + + struct xe_oa_config *oa_config; + struct xe_bb *bb; +}; + static struct ctl_table_header *sysctl_header; static void xe_oa_config_release(struct kref *ref) @@ -73,6 +99,640 @@ static void xe_oa_config_put(struct xe_oa_config *oa_config) kref_put(&oa_config->ref, xe_oa_config_release); } +static struct xe_oa_config *xe_oa_config_get(struct xe_oa_config *oa_config) +{ + return kref_get_unless_zero(&oa_config->ref) ? 
oa_config : NULL; +} + +static struct xe_oa_config *xe_oa_get_oa_config(struct xe_oa *oa, int metrics_set) +{ + struct xe_oa_config *oa_config; + + rcu_read_lock(); + oa_config = idr_find(&oa->metrics_idr, metrics_set); + if (oa_config) + oa_config = xe_oa_config_get(oa_config); + rcu_read_unlock(); + + return oa_config; +} + +static void free_oa_config_bo(struct xe_oa_config_bo *oa_bo) +{ + xe_oa_config_put(oa_bo->oa_config); + xe_bb_free(oa_bo->bb, NULL); + kfree(oa_bo); +} + +static const struct xe_oa_regs *__oa_regs(struct xe_oa_stream *stream) +{ + return &stream->hwe->oa_group->regs; +} + +static int xe_oa_submit_bb(struct xe_oa_stream *stream, struct xe_bb *bb) +{ + struct xe_hw_engine *hwe = stream->hwe; + struct xe_sched_job *job; + struct xe_exec_queue *q; + struct dma_fence *fence; + struct xe_vm *vm; + u64 batch_ofs; + long timeout; + int err = 0; + + vm = xe_migrate_get_vm(stream->gt->tile->migrate); + q = xe_exec_queue_create(hwe->gt->tile->xe, vm, BIT(hwe->logical_instance), 1, + hwe, EXEC_QUEUE_FLAG_WA); + if (IS_ERR(q)) { + err = PTR_ERR(q); + drm_err(&stream->oa->xe->drm, "gt%d, hwe %s, xe_exec_queue_create failed=%d", + stream->gt->info.id, hwe->name, err); + goto put_vm; + } + + batch_ofs = xe_bo_ggtt_addr(stream->gt->tile->mem.kernel_bb_pool->bo); + /* Will add MI_BATCH_BUFFER_END */ + job = xe_bb_create_wa_job(q, bb, batch_ofs); + if (IS_ERR(job)) { + err = PTR_ERR(job); + goto put_exec_q; + } + + xe_sched_job_arm(job); + fence = dma_fence_get(&job->drm.s_fence->finished); + xe_sched_job_push(job); + + timeout = dma_fence_wait_timeout(fence, false, HZ); + dma_fence_put(fence); + if (timeout < 0) + err = timeout; + else if (!timeout) + err = -ETIME; +put_exec_q: + xe_exec_queue_put(q); +put_vm: + xe_vm_put(vm); + + return err; +} + +static void xe_oa_free_oa_buffer(struct xe_oa_stream *stream) +{ + xe_bo_unpin_map_no_vm(stream->oa_buffer.bo); +} + +static void xe_oa_free_configs(struct xe_oa_stream *stream) +{ + struct xe_oa_config_bo *oa_bo, *tmp; + + xe_oa_config_put(stream->oa_config); + llist_for_each_entry_safe(oa_bo, tmp, stream->oa_config_bos.first, node) + free_oa_config_bo(oa_bo); +} + +static void xe_oa_store_flex(struct xe_oa_stream *stream, struct xe_lrc *lrc, + struct xe_bb *bb, const struct flex *flex, u32 count) +{ + u32 offset = xe_bo_ggtt_addr(lrc->bo); + + do { + bb->cs[bb->len++] = MI_STORE_DWORD_IMM_GEN4 | MI_SRM_LRM_GLOBAL_GTT; + bb->cs[bb->len++] = offset + flex->offset * sizeof(u32); + bb->cs[bb->len++] = 0; + bb->cs[bb->len++] = flex->value; + + } while (flex++, --count); +} + +static int xe_oa_modify_context(struct xe_oa_stream *stream, struct xe_lrc *lrc, + const struct flex *flex, u32 count) +{ + struct xe_bb *bb; + int err = 0; + + bb = xe_bb_new(stream->gt, 4 * count + 1, false); + if (IS_ERR(bb)) { + err = PTR_ERR(bb); + goto exit; + } + + xe_oa_store_flex(stream, lrc, bb, flex, count); + + err = xe_oa_submit_bb(stream, bb); + xe_bb_free(bb, NULL); +exit: + return err; +} + +static void xe_oa_load_flex(struct xe_oa_stream *stream, struct xe_bb *bb, + const struct flex *flex, u32 count) +{ + XE_WARN_ON(!count || count > 63); + + bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM(count); + + do { + bb->cs[bb->len++] = flex->reg.addr; + bb->cs[bb->len++] = flex->value; + + } while (flex++, --count); + + bb->cs[bb->len++] = MI_NOOP; +} + +static int xe_oa_modify_self(struct xe_oa_stream *stream, + const struct flex *flex, u32 count) +{ + struct xe_bb *bb; + int err = 0; + + bb = xe_bb_new(stream->gt, 2 * count + 3, false); + if (IS_ERR(bb)) { + err = 
PTR_ERR(bb); + goto exit; + } + + xe_oa_load_flex(stream, bb, flex, count); + + err = xe_oa_submit_bb(stream, bb); + xe_bb_free(bb, NULL); +exit: + return err; +} + +static int xe_oa_configure_oar_context(struct xe_oa_stream *stream, bool enable) +{ + int err; + u32 format = stream->oa_buffer.format->format; + u32 offset = stream->oa->ctx_oactxctrl_offset; + struct flex regs_context[] = { + { + GEN8_OACTXCONTROL, + offset + 1, + enable ? GEN8_OA_COUNTER_RESUME : 0, + }, + }; +#define GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE BIT(8) +#define GEN12_OAR_OACONTROL_OFFSET 0x5B0 + /* Offsets in regs_lri are not used since this configuration is applied using LRI */ + struct flex regs_lri[] = { + { + GEN12_OAR_OACONTROL, + GEN12_OAR_OACONTROL_OFFSET + 1, + (format << GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT) | + (enable ? GEN12_OAR_OACONTROL_COUNTER_ENABLE : 0) + }, + { + RING_CONTEXT_CONTROL(stream->hwe->mmio_base), + CTX_CONTEXT_CONTROL, + _MASKED_FIELD(GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE, + enable ? + GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE : + 0) + }, + }; + + /* Modify stream hwe context image with regs_context */ + err = xe_oa_modify_context(stream, &stream->exec_q->lrc[0], + regs_context, ARRAY_SIZE(regs_context)); + if (err) + return err; + + /* Apply regs_lri using LRI */ + return xe_oa_modify_self(stream, regs_lri, ARRAY_SIZE(regs_lri)); +} + +#define HAS_OA_BPC_REPORTING(xe) (GRAPHICS_VERx100(xe) >= 1255) + +static void xe_oa_disable_metric_set(struct xe_oa_stream *stream) +{ + u32 sqcnt1; + + /* + * Wa_1508761755:xehpsdv, dg2 + * Enable thread stall DOP gating and EU DOP gating. + */ + if (stream->gt->tile->xe->info.platform == XE_DG2) { + xe_gt_mcr_multicast_write(stream->gt, GEN8_ROW_CHICKEN, + _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE)); + xe_mmio_write32(stream->gt, GEN7_ROW_CHICKEN2, + _MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING)); + } + + /* disable the context save/restore or OAR counters */ + if (stream->exec_q) + xe_oa_configure_oar_context(stream, false); + + /* Make sure we disable noa to save power. */ + xe_mmio_rmw32(stream->gt, RPM_CONFIG1, GEN10_GT_NOA_ENABLE, 0); + + sqcnt1 = GEN12_SQCNT1_PMON_ENABLE | + (HAS_OA_BPC_REPORTING(stream->gt->tile->xe) ? GEN12_SQCNT1_OABPC : 0); + + /* Reset PMON Enable to save power. */ + xe_mmio_rmw32(stream->gt, GEN12_SQCNT1, sqcnt1, 0); +} + +static int xe_oa_alloc_oa_buffer(struct xe_oa_stream *stream) +{ + struct xe_bo *bo; + + BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE); + BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M); + + bo = xe_bo_create_pin_map(stream->gt->tile->xe, stream->gt->tile, NULL, + OA_BUFFER_SIZE, ttm_bo_type_kernel, + XE_BO_CREATE_SYSTEM_BIT | XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + stream->oa_buffer.bo = bo; + stream->oa_buffer.vaddr = bo->vmap.is_iomem ? 
+ bo->vmap.vaddr_iomem : bo->vmap.vaddr; + return 0; +} + +static void write_cs_mi_lri(struct xe_bb *bb, const struct xe_oa_reg *reg_data, u32 n_regs) +{ + u32 i; + +#define MI_LOAD_REGISTER_IMM_MAX_REGS (126) + + for (i = 0; i < n_regs; i++) { + if ((i % MI_LOAD_REGISTER_IMM_MAX_REGS) == 0) { + u32 n_lri = min_t(u32, n_regs - i, + MI_LOAD_REGISTER_IMM_MAX_REGS); + + bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM(n_lri); + } + bb->cs[bb->len++] = reg_data[i].addr.addr; + bb->cs[bb->len++] = reg_data[i].value; + } +} + +static int num_lri_dwords(int num_regs) +{ + int count = 0; + + if (num_regs > 0) { + count += DIV_ROUND_UP(num_regs, MI_LOAD_REGISTER_IMM_MAX_REGS); + count += num_regs * 2; + } + + return count; +} + +static struct xe_oa_config_bo * +__xe_oa_alloc_config_buffer(struct xe_oa_stream *stream, struct xe_oa_config *oa_config) +{ + struct xe_oa_config_bo *oa_bo; + size_t config_length = 0; + struct xe_bb *bb; + + oa_bo = kzalloc(sizeof(*oa_bo), GFP_KERNEL); + if (!oa_bo) + return ERR_PTR(-ENOMEM); + + config_length += num_lri_dwords(oa_config->mux_regs_len); + config_length += num_lri_dwords(oa_config->b_counter_regs_len); + config_length += num_lri_dwords(oa_config->flex_regs_len); + config_length++; /* MI_BATCH_BUFFER_END */ + config_length = ALIGN(sizeof(u32) * config_length, XE_PAGE_SIZE) / sizeof(u32); + + bb = xe_bb_new(stream->gt, config_length, false); + if (IS_ERR(bb)) + goto err_free; + + write_cs_mi_lri(bb, oa_config->mux_regs, oa_config->mux_regs_len); + write_cs_mi_lri(bb, oa_config->b_counter_regs, oa_config->b_counter_regs_len); + write_cs_mi_lri(bb, oa_config->flex_regs, oa_config->flex_regs_len); + + oa_bo->bb = bb; + oa_bo->oa_config = xe_oa_config_get(oa_config); + llist_add(&oa_bo->node, &stream->oa_config_bos); + + return oa_bo; +err_free: + kfree(oa_bo); + return ERR_CAST(bb); +} + +static struct xe_oa_config_bo *xe_oa_alloc_config_buffer(struct xe_oa_stream *stream) +{ + struct xe_oa_config *oa_config = stream->oa_config; + struct xe_oa_config_bo *oa_bo; + + /* Look for the buffer in the already allocated BOs attached to the stream */ + llist_for_each_entry(oa_bo, stream->oa_config_bos.first, node) { + if (oa_bo->oa_config == oa_config && + memcmp(oa_bo->oa_config->uuid, oa_config->uuid, + sizeof(oa_config->uuid)) == 0) + goto out; + } + + oa_bo = __xe_oa_alloc_config_buffer(stream, oa_config); +out: + return oa_bo; +} + +static int xe_oa_emit_oa_config(struct xe_oa_stream *stream) +{ + struct xe_oa_config_bo *oa_bo; + int err = 0; + + oa_bo = xe_oa_alloc_config_buffer(stream); + if (IS_ERR(oa_bo)) { + err = PTR_ERR(oa_bo); + goto exit; + } + + err = xe_oa_submit_bb(stream, oa_bo->bb); +exit: + return err; +} + +static u32 oag_report_ctx_switches(const struct xe_oa_stream *stream) +{ + return _MASKED_FIELD(GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS, + stream->sample ? + 0 : GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS); +} + +static int xe_oa_enable_metric_set(struct xe_oa_stream *stream) +{ + u32 sqcnt1; + int ret; + + /* + * Wa_1508761755:xehpsdv, dg2 + * EU NOA signals behave incorrectly if EU clock gating is enabled. + * Disable thread stall DOP gating and EU DOP gating. + */ + if (stream->gt->tile->xe->info.platform == XE_DG2) { + xe_gt_mcr_multicast_write(stream->gt, GEN8_ROW_CHICKEN, + _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE)); + xe_mmio_write32(stream->gt, GEN7_ROW_CHICKEN2, + _MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING)); + } + + xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_debug, + /* Disable clk ratio reports, like previous Gens. 
*/ + _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS | + GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO) | + /* + * If the user didn't require OA reports, instruct the hardware + * not to emit ctx switch reports. + */ + oag_report_ctx_switches(stream)); + + xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_ctx_ctrl, stream->periodic ? + (GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME | + GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE | + (stream->period_exponent << + GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT)) : 0); + + /* + * Initialize Super Queue Internal Cnt Register + * Set PMON Enable in order to collect valid metrics. + * Enable bytes per clock reporting in OA for XEHPSDV onward. + */ + sqcnt1 = GEN12_SQCNT1_PMON_ENABLE | + (HAS_OA_BPC_REPORTING(stream->gt->tile->xe) ? GEN12_SQCNT1_OABPC : 0); + + xe_mmio_rmw32(stream->gt, GEN12_SQCNT1, 0, sqcnt1); + + /* + * For Gen12, performance counters are context saved/restored. Only enable it + * for the context that requested this. + */ + if (stream->exec_q) { + ret = xe_oa_configure_oar_context(stream, true); + if (ret) + return ret; + } + + return xe_oa_emit_oa_config(stream); +} + +static bool engine_supports_mi_query(struct xe_hw_engine *hwe) +{ + return hwe->class == XE_ENGINE_CLASS_RENDER; +} + +#define MI_LRI_LEN(x) (((x) & 0xff) + 1) + +static bool xe_oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end) +{ + u32 idx = *offset; + u32 len = min(MI_LRI_LEN(state[idx]) + idx, end); + bool found = false; + + idx++; + for (; idx < len; idx += 2) { + if (state[idx] == reg) { + found = true; + break; + } + } + + *offset = idx; + return found; +} + +static u32 xe_oa_context_image_offset(struct xe_oa_stream *stream, u32 reg) +{ + u32 len = (xe_lrc_size(stream->gt->tile->xe, stream->hwe->class) - PAGE_SIZE) / 4; + u32 *state = stream->gt->default_lrc[stream->hwe->class]; + u32 offset; + + if (drm_WARN_ON(&stream->oa->xe->drm, !state)) + return U32_MAX; + + for (offset = 0; offset < len; ) { + if (IS_MI_LRI_CMD(state[offset])) { + /* + * We expect reg-value pairs in MI_LRI command, so + * MI_LRI_LEN() should be even + */ + drm_WARN_ON(&stream->oa->xe->drm, + MI_LRI_LEN(state[offset]) & 0x1); + + if (xe_oa_find_reg_in_lri(state, reg, &offset, len)) + break; + } else { + offset++; + } + } + + return offset < len ? offset : U32_MAX; +} + +static int xe_oa_set_ctx_ctrl_offset(struct xe_oa_stream *stream) +{ + struct xe_reg reg = GEN12_OACTXCONTROL(stream->hwe->mmio_base); + u32 offset = stream->oa->ctx_oactxctrl_offset; + + /* Do this only once. Failure is stored as offset of U32_MAX */ + if (offset) + goto exit; + + offset = xe_oa_context_image_offset(stream, reg.addr); + stream->oa->ctx_oactxctrl_offset = offset; + + drm_dbg(&stream->oa->xe->drm, "%s oa ctx control at 0x%08x dword offset\n", + stream->hwe->name, offset); +exit: + return offset && offset != U32_MAX ? 
0 : -ENODEV; +} + +static int xe_oa_stream_init(struct xe_oa_stream *stream, + struct xe_oa_open_properties *props) +{ + struct xe_oa_group *g = props->hwe->oa_group; + struct xe_gt *gt = props->hwe->gt; + struct xe_oa *oa = stream->oa; + int ret; + + stream->poll_oa_period = props->poll_oa_period; + stream->hwe = props->hwe; + stream->gt = stream->hwe->gt; + stream->sample_size = sizeof(struct drm_xe_oa_record_header); + stream->oa_buffer.format = &oa->oa_formats[props->oa_format]; + + stream->sample = props->sample; + stream->sample_size += stream->oa_buffer.format->size; + stream->periodic = props->oa_periodic; + stream->period_exponent = props->oa_period_exponent; + + if (stream->exec_q && engine_supports_mi_query(stream->hwe)) { + /* If we don't find the context offset, just return error */ + ret = xe_oa_set_ctx_ctrl_offset(stream); + if (ret) { + drm_err(&stream->gt->tile->xe->drm, + "xe_oa_set_ctx_ctrl_offset failed for %s\n", + stream->hwe->name); + goto exit; + } + } + + stream->oa_config = xe_oa_get_oa_config(oa, props->metrics_set); + if (!stream->oa_config) { + drm_dbg(&oa->xe->drm, "Invalid OA config id=%i\n", props->metrics_set); + ret = -EINVAL; + goto exit; + } + + ret = xe_oa_alloc_oa_buffer(stream); + if (ret) + goto err_free_configs; + + /* Take runtime pm ref and forcewake to disable RC6 */ + xe_device_mem_access_get(stream->oa->xe); + XE_WARN_ON(xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + + ret = xe_oa_enable_metric_set(stream); + if (ret) { + drm_dbg(&oa->xe->drm, "Unable to enable metric set\n"); + goto err_fw_put; + } + + drm_dbg(&oa->xe->drm, "opening stream oa config uuid=%s\n", + stream->oa_config->uuid); + + WRITE_ONCE(g->exclusive_stream, stream); + + hrtimer_init(&stream->poll_check_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + init_waitqueue_head(&stream->poll_wq); + + spin_lock_init(&stream->oa_buffer.ptr_lock); + mutex_init(&stream->lock); + + return 0; + +err_fw_put: + xe_oa_disable_metric_set(stream); + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + xe_device_mem_access_put(stream->oa->xe); + xe_oa_free_oa_buffer(stream); +err_free_configs: + xe_oa_free_configs(stream); +exit: + return ret; +} + +static int +xe_oa_stream_open_ioctl_locked(struct xe_oa *oa, + struct drm_xe_oa_open_param *param, + struct xe_oa_open_properties *props, + struct drm_file *file) +{ + struct xe_file *xef = to_xe_file(file); + struct xe_oa_stream *stream = NULL; + struct xe_exec_queue *q = NULL; + bool privileged_op = true; + int stream_fd; + int ret; + + if (props->single_exec_q) { + q = xe_exec_queue_lookup(xef, props->exec_q_id); + if (XE_IOCTL_DBG(oa->xe, !q)) { + ret = -ENOENT; + goto err_exec_q; + } + } + + /* + * The OAR unit only monitors the RCS on a per context basis. Relax + * requirements if the user doesn't request global stream access, + * i.e. 
query based sampling using MI_REPORT_PERF_COUNT + */ + if (q && !props->sample) + privileged_op = false; + + if (privileged_op && xe_oa_stream_paranoid && !perfmon_capable()) { + drm_dbg(&oa->xe->drm, "Insufficient privileges to open xe perf stream\n"); + ret = -EACCES; + goto err_exec_q; + } + + if (!props->sample && !q) { + drm_dbg(&oa->xe->drm, "Only OA report sampling supported\n"); + ret = -EINVAL; + goto err_exec_q; + } + + /* We currently only allow exclusive access */ + if (props->hwe->oa_group->exclusive_stream) { + drm_dbg(&oa->xe->drm, "OA unit already in use\n"); + ret = -EBUSY; + goto err_exec_q; + } + + stream = kzalloc(sizeof(*stream), GFP_KERNEL); + if (!stream) { + ret = -ENOMEM; + goto err_exec_q; + } + + stream->oa = oa; + stream->exec_q = q; + + ret = xe_oa_stream_init(stream, props); + if (ret) + goto err_free; + + /* Hold a reference on the drm device till stream_fd is released */ + drm_dev_get(&oa->xe->drm); + + return stream_fd; +err_free: + kfree(stream); +err_exec_q: + if (q) + xe_exec_queue_put(q); + return ret; +} + /* * OA timestamp frequency = CS timestamp frequency in most platforms. On some * platforms OA unit ignores the CTC_SHIFT and the 2 timestamps differ. In such @@ -276,6 +936,8 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, void *data, struct drm_xe_oa_open_param *param = data; struct xe_oa_open_properties props = {}; u32 known_open_flags; + struct xe_gt *gt; + int ret; if (!oa->xe) { drm_dbg(&oa->xe->drm, "xe oa interface not available for this system\n"); @@ -288,9 +950,19 @@ int xe_oa_stream_open_ioctl(struct drm_device *dev, void *data, return -EINVAL; } - return xe_oa_read_properties_unlocked(oa, u64_to_user_ptr(param->properties_ptr), - param->num_properties, - &props); + ret = xe_oa_read_properties_unlocked(oa, u64_to_user_ptr(param->properties_ptr), + param->num_properties, + &props); + if (ret) + return ret; + + gt = props.hwe->gt; + + mutex_lock(&gt->oa.lock); + ret = xe_oa_stream_open_ioctl_locked(oa, param, &props, file); + mutex_unlock(&gt->oa.lock); + + return ret; } static bool xe_oa_is_valid_flex_addr(struct xe_oa *oa, u32 addr) From patchwork Tue Aug 8 01:21:52 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [08/11] drm/xe/oa: Expose OA stream fd From: Ashutosh Dixit X-Patchwork-Id: 551685 Message-Id: <20230808012155.38531-9-ashutosh.dixit@intel.com> To: intel-gfx-trybot@lists.freedesktop.org Date: Mon, 7 Aug 2023 18:21:52 -0700 The OA stream open ioctl returns an fd with its own file_operations for the newly initialized OA stream. These file_operations allow userspace to enable or disable the stream, as well as apply a different counter configuration for the OA stream. Userspace can also poll for data availability. OA stream initialization is completed in this commit by enabling the OA stream. When sampling is enabled, this starts a hrtimer which periodically checks for data availability.
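A hypothetical userspace usage sketch of the returned stream fd (not part of the patch; it assumes the XE_OA_IOCTL_* numbers and fd semantics introduced here, and note that read() on the fd only becomes functional in the following patch):

#include <poll.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <drm/xe_drm.h>	/* uapi header from this series */

static ssize_t xe_oa_drain_example(int stream_fd, void *buf, size_t len)
{
	struct pollfd pfd = { .fd = stream_fd, .events = POLLIN };
	int ret;

	/* No-op if the stream was opened without XE_OA_FLAG_DISABLED */
	ret = ioctl(stream_fd, XE_OA_IOCTL_ENABLE);
	if (ret)
		return ret;

	/* Woken by the hrtimer-driven check when reports are available */
	ret = poll(&pfd, 1, -1);
	if (ret <= 0)
		return ret;

	/* Each record is a drm_xe_oa_record_header followed by the raw OA report */
	return read(stream_fd, buf, len);
}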
Signed-off-by: Ashutosh Dixit --- drivers/gpu/drm/xe/xe_oa.c | 387 +++++++++++++++++++++++++++++++++++++ 1 file changed, 387 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index bb34bd604c2c1..c97180997427e 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -12,6 +12,7 @@ #include #include "regs/xe_engine_regs.h" +#include "regs/xe_gpu_commands.h" #include "regs/xe_gt_regs.h" #include "regs/xe_lrc_layout.h" #include "regs/xe_oa_regs.h" @@ -26,6 +27,7 @@ #include "xe_migrate.h" #include "xe_mmio.h" #include "xe_oa.h" +#include "xe_pm.h" #include "xe_sched_job.h" #include "xe_vm.h" @@ -33,6 +35,7 @@ #define OA_TAKEN(tail, head) (((tail) - (head)) & (OA_BUFFER_SIZE - 1)) #define DEFAULT_POLL_FREQUENCY_HZ 200 #define DEFAULT_POLL_PERIOD_NS (NSEC_PER_SEC / DEFAULT_POLL_FREQUENCY_HZ) +#define INVALID_CTX_ID U32_MAX static u32 xe_oa_stream_paranoid = true; static int xe_oa_sample_rate_hard_limit; @@ -129,6 +132,210 @@ static const struct xe_oa_regs *__oa_regs(struct xe_oa_stream *stream) return &stream->hwe->oa_group->regs; } +static u32 gen12_oa_hw_tail_read(struct xe_oa_stream *stream) +{ + return xe_mmio_read32(stream->gt, __oa_regs(stream)->oa_tail_ptr) & + GEN12_OAG_OATAILPTR_MASK; +} + +#define oa_report_header_64bit(__s) \ + ((__s)->oa_buffer.format->header == HDR_64_BIT) + +static u64 oa_report_id(struct xe_oa_stream *stream, void *report) +{ + return oa_report_header_64bit(stream) ? *(u64 *)report : *(u32 *)report; +} + +static u64 oa_timestamp(struct xe_oa_stream *stream, void *report) +{ + return oa_report_header_64bit(stream) ? + *((u64 *)report + 1) : + *((u32 *)report + 1); +} + +static bool xe_oa_buffer_check_unlocked(struct xe_oa_stream *stream) +{ + u32 gtt_offset = xe_bo_ggtt_addr(stream->oa_buffer.bo); + int report_size = stream->oa_buffer.format->size; + u32 tail, hw_tail; + unsigned long flags; + bool pollin; + u32 partial_report_size; + + /* + * We have to consider the (unlikely) possibility that read() errors could result + * in an OA buffer reset which might reset the head and tail state. + */ + spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); + + hw_tail = gen12_oa_hw_tail_read(stream); + hw_tail -= gtt_offset; + + /* + * The tail pointer increases in 64 byte increments, not in report_size + * steps. Also the report size may not be a power of 2. Compute potentially + * partially landed report in the OA buffer + */ + partial_report_size = OA_TAKEN(hw_tail, stream->oa_buffer.tail); + partial_report_size %= report_size; + + /* Subtract partial amount off the tail */ + hw_tail = OA_TAKEN(hw_tail, partial_report_size); + + tail = hw_tail; + + /* + * Walk the stream backward until we find a report with report id and timestmap + * not at 0. Since the circular buffer pointers progress by increments of 64 bytes + * and that reports can be up to 256 bytes long, we can't tell whether a report + * has fully landed in memory before the report id and timestamp of the following + * report have effectively landed. + * + * This is assuming that the writes of the OA unit land in memory in the order + * they were written to. 
If not : (╯°□°)╯︵ ┻━┻ + */ + while (OA_TAKEN(tail, stream->oa_buffer.tail) >= report_size) { + void *report = stream->oa_buffer.vaddr + tail; + + if (oa_report_id(stream, report) || + oa_timestamp(stream, report)) + break; + + tail = OA_TAKEN(tail, report_size); + } + + if (OA_TAKEN(hw_tail, tail) > report_size) + drm_dbg(&stream->oa->xe->drm, + "unlanded report(s) head=0x%x tail=0x%x hw_tail=0x%x\n", + stream->oa_buffer.head, tail, hw_tail); + + stream->oa_buffer.tail = tail; + + pollin = OA_TAKEN(stream->oa_buffer.tail, + stream->oa_buffer.head) >= report_size; + + spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); + + return pollin; +} + +static enum hrtimer_restart xe_oa_poll_check_timer_cb(struct hrtimer *hrtimer) +{ + struct xe_oa_stream *stream = + container_of(hrtimer, typeof(*stream), poll_check_timer); + + if (xe_oa_buffer_check_unlocked(stream)) { + stream->pollin = true; + wake_up(&stream->poll_wq); + } + + hrtimer_forward_now(hrtimer, ns_to_ktime(stream->poll_oa_period)); + + return HRTIMER_RESTART; +} + +static void xe_oa_init_oa_buffer(struct xe_oa_stream *stream) +{ + u32 gtt_offset = xe_bo_ggtt_addr(stream->oa_buffer.bo); + unsigned long flags; + + spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); + + xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_status, 0); + xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_head_ptr, + gtt_offset & GEN12_OAG_OAHEADPTR_MASK); + stream->oa_buffer.head = 0; + + /* + * PRM says: "This MMIO must be set before the OATAILPTR register and after the + * OAHEADPTR register. This is to enable proper functionality of the overflow bit". + */ + xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_buffer, gtt_offset | + OABUFFER_SIZE_16M | GEN12_OAG_OABUFFER_MEMORY_SELECT); + xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_tail_ptr, + gtt_offset & GEN12_OAG_OATAILPTR_MASK); + + /* Mark that we need updated tail pointers to read from... */ + stream->oa_buffer.tail = 0; + + /* + * Reset state used to recognise context switches, affecting which reports we will + * forward to userspace while filtering for a single context. + */ + stream->oa_buffer.last_ctx_id = INVALID_CTX_ID; + + spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); + + /* Zero out the OA buffer since we rely on zero report id and timestamp fields */ + memset(stream->oa_buffer.vaddr, 0, stream->oa_buffer.bo->size); +} + +static void xe_oa_enable(struct xe_oa_stream *stream) +{ + const struct xe_oa_regs *regs; + u32 val; + + /* + * If we don't want OA reports from the OA buffer, then we don't + * even need to program the OAG unit. 
+ */ + if (!stream->sample) + return; + + xe_oa_init_oa_buffer(stream); + + regs = __oa_regs(stream); + val = (stream->oa_buffer.format->format << regs->oa_ctrl_counter_format_shift) | + GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE; + + xe_mmio_write32(stream->gt, regs->oa_ctrl, val); +} + +static void xe_oa_disable(struct xe_oa_stream *stream) +{ + xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_ctrl, 0); + if (xe_mmio_wait32(stream->gt, __oa_regs(stream)->oa_ctrl, + GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE, 0, 50000, NULL, false)) + drm_err(&stream->oa->xe->drm, + "wait for OA to be disabled timed out\n"); + + xe_mmio_write32(stream->gt, GEN12_OA_TLB_INV_CR, 1); + if (xe_mmio_wait32(stream->gt, GEN12_OA_TLB_INV_CR, 1, 0, 50000, NULL, false)) + drm_err(&stream->oa->xe->drm, + "wait for OA tlb invalidate timed out\n"); +} + +static __poll_t xe_oa_poll_locked(struct xe_oa_stream *stream, + struct file *file, poll_table *wait) +{ + __poll_t events = 0; + + poll_wait(file, &stream->poll_wq, wait); + + /* + * We don't explicitly check whether there's something to read here since this + * path may be hot depending on what else userspace is polling, or on the timeout + * in use. We rely on hrtimer/xe_oa_poll_check_timer_cb to notify us when there + * are samples to read. + */ + if (stream->pollin) + events |= EPOLLIN; + + return events; +} + +static __poll_t xe_oa_poll(struct file *file, poll_table *wait) +{ + struct xe_oa_stream *stream = file->private_data; + __poll_t ret; + + mutex_lock(&stream->lock); + ret = xe_oa_poll_locked(stream, file, wait); + mutex_unlock(&stream->lock); + + return ret; +} + static int xe_oa_submit_bb(struct xe_oa_stream *stream, struct xe_bb *bb) { struct xe_hw_engine *hwe = stream->hwe; @@ -333,6 +540,26 @@ static void xe_oa_disable_metric_set(struct xe_oa_stream *stream) xe_mmio_rmw32(stream->gt, GEN12_SQCNT1, sqcnt1, 0); } +static void xe_oa_stream_destroy(struct xe_oa_stream *stream) +{ + struct xe_oa_group *g = stream->hwe->oa_group; + struct xe_gt *gt = stream->hwe->gt; + + if (WARN_ON(stream != g->exclusive_stream)) + return; + + /* Unset exclusive_stream first */ + WRITE_ONCE(g->exclusive_stream, NULL); + xe_oa_disable_metric_set(stream); + + xe_oa_free_oa_buffer(stream); + + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + xe_device_mem_access_put(stream->oa->xe); + + xe_oa_free_configs(stream); +} + static int xe_oa_alloc_oa_buffer(struct xe_oa_stream *stream) { struct xe_bo *bo; @@ -514,6 +741,148 @@ static int xe_oa_enable_metric_set(struct xe_oa_stream *stream) return xe_oa_emit_oa_config(stream); } +static void xe_oa_stream_enable(struct xe_oa_stream *stream) +{ + stream->pollin = false; + + xe_oa_enable(stream); + + if (stream->sample) + hrtimer_start(&stream->poll_check_timer, + ns_to_ktime(stream->poll_oa_period), + HRTIMER_MODE_REL_PINNED); +} + +static void xe_oa_stream_disable(struct xe_oa_stream *stream) +{ + xe_oa_disable(stream); + + if (stream->sample) + hrtimer_cancel(&stream->poll_check_timer); +} + +static void xe_oa_enable_locked(struct xe_oa_stream *stream) +{ + if (stream->enabled) + return; + + stream->enabled = true; + + xe_oa_stream_enable(stream); +} + +static void xe_oa_disable_locked(struct xe_oa_stream *stream) +{ + if (!stream->enabled) + return; + + stream->enabled = false; + + xe_oa_stream_disable(stream); +} + +static long xe_oa_config_locked(struct xe_oa_stream *stream, + unsigned long metrics_set) +{ + struct xe_oa_config *config; + long ret = stream->oa_config->id; + + config = xe_oa_get_oa_config(stream->oa, 
metrics_set); + if (!config) + return -ENODEV; + + if (config != stream->oa_config) { + int err; + + /* + * If OA is bound to a specific engine, emit the reconfiguration + * inline from that engine. The update will then be ordered with + * respect to submission on that engine. + */ + err = xe_oa_emit_oa_config(stream); + if (!err) + config = xchg(&stream->oa_config, config); + else + ret = err; + } + + xe_oa_config_put(config); + + return ret; +} + +static long xe_oa_ioctl_locked(struct xe_oa_stream *stream, + unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case XE_OA_IOCTL_ENABLE: + xe_oa_enable_locked(stream); + return 0; + case XE_OA_IOCTL_DISABLE: + xe_oa_disable_locked(stream); + return 0; + case XE_OA_IOCTL_CONFIG: + return xe_oa_config_locked(stream, arg); + } + + return -EINVAL; +} + +static long xe_oa_ioctl(struct file *file, + unsigned int cmd, + unsigned long arg) +{ + struct xe_oa_stream *stream = file->private_data; + long ret; + + mutex_lock(&stream->lock); + ret = xe_oa_ioctl_locked(stream, cmd, arg); + mutex_unlock(&stream->lock); + + return ret; +} + +static void xe_oa_destroy_locked(struct xe_oa_stream *stream) +{ + if (stream->enabled) + xe_oa_disable_locked(stream); + + xe_oa_stream_destroy(stream); + + if (stream->exec_q) + xe_exec_queue_put(stream->exec_q); + + kfree(stream); +} + +static int xe_oa_release(struct inode *inode, struct file *file) +{ + struct xe_oa_stream *stream = file->private_data; + struct xe_gt *gt = stream->gt; + + /* + * Within this call, we know that the fd is being closed and we have no other + * user of stream->lock. Use the perf lock to destroy the stream here. + */ + mutex_lock(>->oa.lock); + xe_oa_destroy_locked(stream); + mutex_unlock(>->oa.lock); + + /* Release the reference the perf stream kept on the driver. 
*/ + drm_dev_put(>->tile->xe->drm); + + return 0; +} + +static const struct file_operations xe_oa_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .release = xe_oa_release, + .poll = xe_oa_poll, + .unlocked_ioctl = xe_oa_ioctl, +}; + static bool engine_supports_mi_query(struct xe_hw_engine *hwe) { return hwe->class == XE_ENGINE_CLASS_RENDER; @@ -642,6 +1011,7 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream, WRITE_ONCE(g->exclusive_stream, stream); hrtimer_init(&stream->poll_check_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + stream->poll_check_timer.function = xe_oa_poll_check_timer_cb; init_waitqueue_head(&stream->poll_wq); spin_lock_init(&stream->oa_buffer.ptr_lock); @@ -669,6 +1039,7 @@ xe_oa_stream_open_ioctl_locked(struct xe_oa *oa, struct xe_file *xef = to_xe_file(file); struct xe_oa_stream *stream = NULL; struct xe_exec_queue *q = NULL; + unsigned long f_flags = 0; bool privileged_op = true; int stream_fd; int ret; @@ -721,10 +1092,26 @@ xe_oa_stream_open_ioctl_locked(struct xe_oa *oa, if (ret) goto err_free; + if (param->flags & XE_OA_FLAG_FD_CLOEXEC) + f_flags |= O_CLOEXEC; + if (param->flags & XE_OA_FLAG_FD_NONBLOCK) + f_flags |= O_NONBLOCK; + + stream_fd = anon_inode_getfd("[xe_oa]", &xe_oa_fops, stream, f_flags); + if (stream_fd < 0) { + ret = stream_fd; + goto err_destroy; + } + + if (!(param->flags & XE_OA_FLAG_DISABLED)) + xe_oa_enable_locked(stream); + /* Hold a reference on the drm device till stream_fd is released */ drm_dev_get(&oa->xe->drm); return stream_fd; +err_destroy: + xe_oa_stream_destroy(stream); err_free: kfree(stream); err_exec_q: From patchwork Tue Aug 8 01:21:53 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [09/11] drm/xe/oa: Read file_operation From: Ashutosh Dixit X-Patchwork-Id: 551688 Message-Id: <20230808012155.38531-10-ashutosh.dixit@intel.com> To: intel-gfx-trybot@lists.freedesktop.org Date: Mon, 7 Aug 2023 18:21:53 -0700 Finally implement the OA stream read file_operation which was the only fop missing in the previous commit. Both blocking and non-blocking reads are supported. The read copies OA perf data from the OA buffer to the user buffer provided as part of read system call. Signed-off-by: Ashutosh Dixit --- drivers/gpu/drm/xe/xe_oa.c | 359 +++++++++++++++++++++++++++++++++++++ 1 file changed, 359 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index c97180997427e..fcbb352f36a48 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -146,6 +146,29 @@ static u64 oa_report_id(struct xe_oa_stream *stream, void *report) return oa_report_header_64bit(stream) ? *(u64 *)report : *(u32 *)report; } +#define OAREPORT_REASON_MASK_EXTENDED GENMASK(25, 19) +#define OAREPORT_REASON_TIMER BIT(0) +#define OAREPORT_REASON_CTX_SWITCH BIT(3) +#define OAREPORT_REASON_CLK_RATIO BIT(5) + +static u64 oa_report_reason(struct xe_oa_stream *stream, void *report) +{ + return FIELD_GET(OAREPORT_REASON_MASK_EXTENDED, oa_report_id(stream, report)); +} + +static void oa_report_id_clear(struct xe_oa_stream *stream, u32 *report) +{ + if (oa_report_header_64bit(stream)) + *(u64 *)report = 0; + else + *report = 0; +} + +static bool oa_report_ctx_invalid(struct xe_oa_stream *stream, void *report) +{ + return false; +} + static u64 oa_timestamp(struct xe_oa_stream *stream, void *report) { return oa_report_header_64bit(stream) ? 
@@ -153,6 +176,29 @@ static u64 oa_timestamp(struct xe_oa_stream *stream, void *report) *((u32 *)report + 1); } +static void oa_timestamp_clear(struct xe_oa_stream *stream, u32 *report) +{ + if (oa_report_header_64bit(stream)) + *(u64 *)&report[2] = 0; + else + report[1] = 0; +} + +static u32 oa_context_id(struct xe_oa_stream *stream, u32 *report) +{ + u32 ctx_id = oa_report_header_64bit(stream) ? report[4] : report[2]; + + return ctx_id & stream->specific_ctx_id_mask; +} + +static void oa_context_id_squash(struct xe_oa_stream *stream, u32 *report) +{ + if (oa_report_header_64bit(stream)) + report[4] = INVALID_CTX_ID; + else + report[2] = INVALID_CTX_ID; +} + static bool xe_oa_buffer_check_unlocked(struct xe_oa_stream *stream) { u32 gtt_offset = xe_bo_ggtt_addr(stream->oa_buffer.bo); @@ -234,6 +280,199 @@ static enum hrtimer_restart xe_oa_poll_check_timer_cb(struct hrtimer *hrtimer) return HRTIMER_RESTART; } +static int xe_oa_append_status(struct xe_oa_stream *stream, char __user *buf, + size_t count, size_t *offset, + enum drm_xe_oa_record_type type) +{ + struct drm_xe_oa_record_header header = { type, 0, sizeof(header) }; + + if ((count - *offset) < header.size) + return -ENOSPC; + + if (copy_to_user(buf + *offset, &header, sizeof(header))) + return -EFAULT; + + *offset += header.size; + + return 0; +} + +static int xe_oa_append_sample(struct xe_oa_stream *stream, char __user *buf, + size_t count, size_t *offset, const u8 *report) +{ + int report_size = stream->oa_buffer.format->size; + struct drm_xe_oa_record_header header; + int report_size_partial; + u8 *oa_buf_end; + + header.type = DRM_XE_OA_RECORD_SAMPLE; + header.pad = 0; + header.size = stream->sample_size; + + if ((count - *offset) < header.size) + return -ENOSPC; + + buf += *offset; + if (copy_to_user(buf, &header, sizeof(header))) + return -EFAULT; + buf += sizeof(header); + + oa_buf_end = stream->oa_buffer.vaddr + OA_BUFFER_SIZE; + report_size_partial = oa_buf_end - report; + + if (report_size_partial < report_size) { + if (copy_to_user(buf, report, report_size_partial)) + return -EFAULT; + buf += report_size_partial; + + if (copy_to_user(buf, stream->oa_buffer.vaddr, + report_size - report_size_partial)) + return -EFAULT; + } else if (copy_to_user(buf, report, report_size)) { + return -EFAULT; + } + + *offset += header.size; + + return 0; +} + +static int xe_oa_append_reports(struct xe_oa_stream *stream, char __user *buf, + size_t count, size_t *offset) +{ + int report_size = stream->oa_buffer.format->size; + u8 *oa_buf_base = stream->oa_buffer.vaddr; + u32 gtt_offset = xe_bo_ggtt_addr(stream->oa_buffer.bo); + u32 mask = (OA_BUFFER_SIZE - 1); + size_t start_offset = *offset; + unsigned long flags; + u32 head, tail; + int ret = 0; + + if (drm_WARN_ON(&stream->oa->xe->drm, !stream->enabled)) + return -EIO; + + spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); + + head = stream->oa_buffer.head; + tail = stream->oa_buffer.tail; + + spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); + + /* An out of bounds or misaligned head or tail pointer implies a driver bug */ + if (drm_WARN_ONCE(&stream->oa->xe->drm, + head > OA_BUFFER_SIZE || tail > OA_BUFFER_SIZE, + "Inconsistent OA buffer pointers: head = %u, tail = %u\n", + head, tail)) + return -EIO; + + for (/* none */; OA_TAKEN(tail, head); head = (head + report_size) & mask) { + u8 *report = oa_buf_base + head; + u32 ctx_id, *report32 = (void *)report; + u64 reason; + + /* + * The reason field indicates what triggered this report (e.g. 
timer + * triggered or a context switch). + * + * In MMIO triggered reports, some platforms do not set the reason bit in + * this field and it is valid to have a reason field of zero. + */ + reason = oa_report_reason(stream, report); + ctx_id = oa_context_id(stream, report32); + + /* + * Squash whatever is in the CTX_ID field if it's marked as invalid to be + * sure we avoid false-positive, single-context filtering below... + * + * Note: we don't clear the valid_ctx_bit so userspace can understand that + * the ID has been squashed by the kernel. + */ + if (oa_report_ctx_invalid(stream, report)) { + ctx_id = INVALID_CTX_ID; + oa_context_id_squash(stream, report32); + } + + /* + * NB: The OA unit does not support clock gating off for a specific + * context and the kernel can't securely stop counters from updating as + * system-wide/global values. + * + * Automatic reports include a context ID so reports can be filtered on + * the cpu but it's not worth trying to automatically subtract/hide + * counter progress for other contexts while filtering since userspace can + * issue MI_REPORT_PERF_COUNT commands which would still provide a + * side-band view of the real values. + * + * To allow userspace to normalize counters for a single filtered context + * then it needs be forwarded bookend context-switch reports so that it + * can track switches in between MI_REPORT_PERF_COUNT commands and can + * itself subtract/ignore the progress of counters associated with other + * contexts. Note that the hardware automatically triggers reports when + * switching to a new context which are tagged with the ID of the newly + * active context. To avoid the complexity of reading ahead while parsing + * reports to try and minimize forwarding redundant context switch reports + * (i.e. between other, unrelated contexts) we simply elect to forward + * them all. + * + * We don't rely solely on the reason field to identify context switches + * since it's not-uncommon for periodic samples to identify a switch + * before any 'context switch' report. + */ + if (!stream->exec_q || stream->specific_ctx_id == ctx_id || + stream->oa_buffer.last_ctx_id == stream->specific_ctx_id || + reason & OAREPORT_REASON_CTX_SWITCH) { + /* + * While filtering for a single context we avoid + * leaking the IDs of other contexts. + */ + if (stream->exec_q && stream->specific_ctx_id != ctx_id) + oa_context_id_squash(stream, report32); + + ret = xe_oa_append_sample(stream, buf, count, offset, report); + if (ret) + break; + + stream->oa_buffer.last_ctx_id = ctx_id; + } + + if (is_power_of_2(report_size)) { + /* + * Clear out report id and timestamp as a means to + * detect unlanded reports. 
+ */ + oa_report_id_clear(stream, report32); + oa_timestamp_clear(stream, report32); + } else { + u8 *oa_buf_end = stream->oa_buffer.vaddr + + OA_BUFFER_SIZE; + u32 part = oa_buf_end - (u8 *)report32; + + /* Zero out the entire report */ + if (report_size <= part) { + memset(report32, 0, report_size); + } else { + memset(report32, 0, part); + memset(oa_buf_base, 0, report_size - part); + } + } + } + + if (start_offset != *offset) { + struct xe_reg oaheadptr = __oa_regs(stream)->oa_head_ptr; + + spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); + + xe_mmio_write32(stream->gt, oaheadptr, + (head + gtt_offset) & GEN12_OAG_OAHEADPTR_MASK); + stream->oa_buffer.head = head; + + spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); + } + + return ret; +} + static void xe_oa_init_oa_buffer(struct xe_oa_stream *stream) { u32 gtt_offset = xe_bo_ggtt_addr(stream->oa_buffer.bo); @@ -305,6 +544,125 @@ static void xe_oa_disable(struct xe_oa_stream *stream) "wait for OA tlb invalidate timed out\n"); } +static int __xe_oa_read(struct xe_oa_stream *stream, char __user *buf, + size_t count, size_t *offset) +{ + struct xe_reg oastatus_reg = __oa_regs(stream)->oa_status; + u32 oastatus; + int ret; + + if (drm_WARN_ON(&stream->oa->xe->drm, !stream->oa_buffer.vaddr)) + return -EIO; + + oastatus = xe_mmio_read32(stream->gt, oastatus_reg); + + /* + * We treat OABUFFER_OVERFLOW as a significant error: + * + * We could handle this more gracefully, but some Gens don't correctly suppress + * certain automatically triggered reports in this condition and so we have to + * assume that old reports are now being trampled over. + * + * Considering how we don't currently give userspace control over the OA buffer + * size and always configure a large 16MB buffer, then a buffer overflow does + * anyway likely indicate that something has gone quite badly wrong. + */ + if (oastatus & GEN12_OAG_OASTATUS_BUFFER_OVERFLOW) { + ret = xe_oa_append_status(stream, buf, count, offset, + DRM_XE_OA_RECORD_OA_BUFFER_LOST); + if (ret) + return ret; + + drm_dbg(&stream->oa->xe->drm, + "OA buffer overflow (exponent = %d): force restart\n", + stream->period_exponent); + + xe_oa_disable(stream); + xe_oa_enable(stream); + + /* + * Note: oa_enable is expected to re-init the oabuffer and reset + * oastatus_reg for us + */ + oastatus = xe_mmio_read32(stream->gt, oastatus_reg); + } + + if (oastatus & GEN12_OAG_OASTATUS_REPORT_LOST) { + ret = xe_oa_append_status(stream, buf, count, offset, + DRM_XE_OA_RECORD_OA_REPORT_LOST); + if (ret) + return ret; + + xe_mmio_rmw32(stream->gt, oastatus_reg, + GEN12_OAG_OASTATUS_COUNTER_OVERFLOW | + GEN12_OAG_OASTATUS_REPORT_LOST, 0); + } + + return xe_oa_append_reports(stream, buf, count, offset); +} + +static int xe_oa_wait_unlocked(struct xe_oa_stream *stream) +{ + /* We might wait indefinitely if periodic sampling is not enabled */ + if (!stream->periodic) + return -EIO; + + return wait_event_interruptible(stream->poll_wq, + xe_oa_buffer_check_unlocked(stream)); +} + +static ssize_t xe_oa_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct xe_oa_stream *stream = file->private_data; + size_t offset = 0; + int ret; + + /* Can't read from disabled streams */ + if (!stream->enabled || !stream->sample) + return -EIO; + + if (!(file->f_flags & O_NONBLOCK)) { + /* + * There's the small chance of false positives from wait_unlocked, + * e.g. 
with single engine filtering since we only wait until oabuffer + * has >= 1 report we don't immediately know whether any reports really + * belong to the current engine. + */ + do { + ret = xe_oa_wait_unlocked(stream); + if (ret) + return ret; + + mutex_lock(&stream->lock); + ret = __xe_oa_read(stream, buf, count, &offset); + mutex_unlock(&stream->lock); + } while (!offset && !ret); + } else { + mutex_lock(&stream->lock); + ret = __xe_oa_read(stream, buf, count, &offset); + mutex_unlock(&stream->lock); + } + + /* + * We allow the poll checking to sometimes report false positive EPOLLIN + * events where we might actually report EAGAIN on read() if there's + * not really any data available. In this situation though we don't + * want to enter a busy loop between poll() reporting a EPOLLIN event + * and read() returning -EAGAIN. Clearing the oa.pollin state here + * effectively ensures we back off until the next hrtimer callback + * before reporting another EPOLLIN event. + * The exception to this is if __xe_oa_read returned -ENOSPC which means + * that more OA data is available than could fit in the user provided + * buffer. In this case we want the next poll() call to not block. + */ + if (ret != -ENOSPC) + stream->pollin = false; + + /* Possible values for ret are 0, -EFAULT, -ENOSPC, -EIO, ... */ + return offset ?: (ret ?: -EAGAIN); +} + static __poll_t xe_oa_poll_locked(struct xe_oa_stream *stream, struct file *file, poll_table *wait) { @@ -880,6 +1238,7 @@ static const struct file_operations xe_oa_fops = { .llseek = no_llseek, .release = xe_oa_release, .poll = xe_oa_poll, + .read = xe_oa_read, .unlocked_ioctl = xe_oa_ioctl, }; From patchwork Tue Aug 8 01:21:54 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [10/11] drm/xe/oa: Implement queries From: Ashutosh Dixit X-Patchwork-Id: 551684 Message-Id: <20230808012155.38531-11-ashutosh.dixit@intel.com> To: intel-gfx-trybot@lists.freedesktop.org Date: Mon, 7 Aug 2023 18:21:54 -0700 Implement queries to query OA unit ID's for HW engines, OA timestamp freq and OA ioctl version. Signed-off-by: Ashutosh Dixit --- drivers/gpu/drm/xe/xe_oa.c | 11 +++++++++++ drivers/gpu/drm/xe/xe_oa.h | 3 +++ drivers/gpu/drm/xe/xe_query.c | 5 ++++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index fcbb352f36a48..f320e5cd76a13 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -2219,6 +2219,12 @@ static int xe_oa_init_engine_groups(struct xe_oa *oa) return 0; } +u32 xe_oa_unit_id(struct xe_hw_engine *hwe) +{ + return hwe->oa_group && hwe->oa_group->num_engines ? 
+ hwe->oa_group->oa_unit_id : U32_MAX; +} + static void oa_format_add(struct xe_oa *oa, enum drm_xe_oa_format format) { __set_bit(format, oa->format_mask); @@ -2333,6 +2339,11 @@ static struct ctl_table oa_ctl_table[] = { {} }; +int xe_oa_ioctl_version(struct xe_device *xe) +{ + return 1; +} + int xe_oa_sysctl_register(void) { sysctl_header = register_sysctl("dev/xe", oa_ctl_table); diff --git a/drivers/gpu/drm/xe/xe_oa.h b/drivers/gpu/drm/xe/xe_oa.h index fd6caf652047a..41a7d8b0f10e1 100644 --- a/drivers/gpu/drm/xe/xe_oa.h +++ b/drivers/gpu/drm/xe/xe_oa.h @@ -22,4 +22,7 @@ int xe_oa_add_config_ioctl(struct drm_device *dev, void *data, struct drm_file *file); int xe_oa_remove_config_ioctl(struct drm_device *dev, void *data, struct drm_file *file); +u32 xe_oa_timestamp_frequency(struct xe_device *xe); +u32 xe_oa_unit_id(struct xe_hw_engine *hwe); + #endif diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c index 7ea235c71385f..3dbc4bbae4d02 100644 --- a/drivers/gpu/drm/xe/xe_query.c +++ b/drivers/gpu/drm/xe/xe_query.c @@ -78,7 +78,8 @@ static int query_engines(struct xe_device *xe, xe_to_user_engine_class[hwe->class]; hw_engine_info[i].engine_instance = hwe->logical_instance; - hw_engine_info[i++].gt_id = gt->info.id; + hw_engine_info[i].gt_id = gt->info.id; + hw_engine_info[i++].oa_unit_id = xe_oa_unit_id(hwe); } if (copy_to_user(query_ptr, hw_engine_info, size)) { @@ -204,6 +205,7 @@ static int query_config(struct xe_device *xe, struct drm_xe_device_query *query) hweight_long(xe->info.mem_region_mask); config->info[XE_QUERY_CONFIG_MAX_ENGINE_PRIORITY] = xe_exec_queue_device_get_max_priority(xe); + config->info[XE_QUERY_OA_IOCTL_VERSION] = xe_oa_ioctl_version(xe); if (copy_to_user(query_ptr, config, size)) { kfree(config); @@ -245,6 +247,7 @@ static int query_gts(struct xe_device *xe, struct drm_xe_device_query *query) gts->gts[id].type = XE_QUERY_GT_TYPE_MAIN; gts->gts[id].instance = id; gts->gts[id].clock_freq = gt->info.clock_freq; + gts->gts[id].oa_timestamp_freq = xe_oa_timestamp_frequency(xe); if (!IS_DGFX(xe)) gts->gts[id].native_mem_regions = 0x1; else From patchwork Tue Aug 8 01:21:55 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [11/11] HAX: drm/xe/oa: Incomplete features and FIXME's From: Ashutosh Dixit X-Patchwork-Id: 551683 Message-Id: <20230808012155.38531-12-ashutosh.dixit@intel.com> To: intel-gfx-trybot@lists.freedesktop.org Date: Mon, 7 Aug 2023 18:21:55 -0700 This last HAX patch is added as a help to reviewing the Xe OA patchset. The following i915 features have not been included in the Xe OA patchset: * Inline batch submission on stream exec_queue/hw_engine * NOA wait * GuC ctx id (guc_sw_ctx_id) * CTX_R_PWR_CLK_STATE/GEN8_R_PWR_CLK_STATE * hold_preemption (DRM_XE_OA_PROP_HOLD_PREEMPTION) * sseu_config (DRM_XE_OA_PROP_GLOBAL_SSEU) * Override gucrc (override_gucrc_mode) * MTL bios_c6_setup * ratelimits * compat ioctl Theis HAX patch contains: a. Incomplete ports of the features listed above which are not included in the Xe OA patchset b. FIXME highlighting significant changes between i915 and xe, and c. FIXME containing author comments about implementation caveats Therefore FIXME's (and associated comments) in this HAX patch should guide in reviewing the Xe OA patchset. 
Signed-off-by: Ashutosh Dixit --- drivers/gpu/drm/xe/xe_oa.c | 783 +++++++++++++++++++++++++++++-- drivers/gpu/drm/xe/xe_oa.h | 7 + drivers/gpu/drm/xe/xe_oa_types.h | 65 +++ include/uapi/drm/xe_drm.h | 2 + 4 files changed, 828 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index f320e5cd76a13..3b43b91a79a65 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -3,6 +3,12 @@ * Copyright © 2023 Intel Corporation */ +/* + * Current list of features missing in xe kmd: + * - get_default_sseu_config + * - xe_engine_set_nopreempt + */ + #include #include #include @@ -31,6 +37,8 @@ #include "xe_sched_job.h" #include "xe_vm.h" +#define __UNUSED__ __attribute__((unused)) + #define OA_BUFFER_SIZE SZ_16M #define OA_TAKEN(tail, head) (((tail) - (head)) & (OA_BUFFER_SIZE - 1)) #define DEFAULT_POLL_FREQUENCY_HZ 200 @@ -59,15 +67,18 @@ static const struct xe_oa_format oa_formats[] = { }; struct xe_oa_open_properties { - bool sample; - bool single_exec_q; - u64 exec_q_id; + bool sample; // FIXME: previously sample_flags, changed to bool + bool single_exec_q; // FIXME: single_context + u64 exec_q_id; // FIXME: ctx_handle + bool hold_preemption; int metrics_set; int oa_format; bool oa_periodic; int oa_period_exponent; + // struct intel_sseu sseu; // FIXME: support in xe kmd? + struct xe_hw_engine *hwe; u64 poll_oa_period; @@ -77,7 +88,7 @@ struct xe_oa_config_bo { struct llist_node node; struct xe_oa_config *oa_config; - struct xe_bb *bb; + struct xe_bb *bb; // FIXME: check }; static struct ctl_table_header *sysctl_header; @@ -250,7 +261,8 @@ static bool xe_oa_buffer_check_unlocked(struct xe_oa_stream *stream) tail = OA_TAKEN(tail, report_size); } - if (OA_TAKEN(hw_tail, tail) > report_size) + if (OA_TAKEN(hw_tail, tail) > report_size && + __ratelimit(&stream->oa->tail_pointer_race)) drm_dbg(&stream->oa->xe->drm, "unlanded report(s) head=0x%x tail=0x%x hw_tail=0x%x\n", stream->oa_buffer.head, tail, hw_tail); @@ -419,14 +431,15 @@ static int xe_oa_append_reports(struct xe_oa_stream *stream, char __user *buf, * since it's not-uncommon for periodic samples to identify a switch * before any 'context switch' report. */ - if (!stream->exec_q || stream->specific_ctx_id == ctx_id || + if (!stream->exec_q || // FIXME: check + stream->specific_ctx_id == ctx_id || stream->oa_buffer.last_ctx_id == stream->specific_ctx_id || reason & OAREPORT_REASON_CTX_SWITCH) { /* * While filtering for a single context we avoid * leaking the IDs of other contexts. */ - if (stream->exec_q && stream->specific_ctx_id != ctx_id) + if (stream->exec_q && stream->specific_ctx_id != ctx_id) // FIXME: check oa_context_id_squash(stream, report32); ret = xe_oa_append_sample(stream, buf, count, offset, report); @@ -694,6 +707,94 @@ static __poll_t xe_oa_poll(struct file *file, poll_table *wait) return ret; } +#if 0 +// If this is needed need to look into further +static int xe_oa_submit_bb(struct xe_oa_stream *stream, struct xe_bb *bb) +{ + struct xe_hw_engine *hwe = stream->hwe; + struct xe_engine *e; + struct xe_sched_job *job; + struct dma_fence *fence; + struct xe_vm *vm; + u64 batch_ofs; + long timeout; + int err = 0; + + if (stream->engine) { + /* + * FIXME: can we send kernel bb in e->vm context? 
Seems to be + * causing big problems (cat err) which need to be investigated + */ + e = stream->engine; + XE_BUG_ON(!e->vm); + err = dma_resv_lock_interruptible(&e->vm->resv, NULL); + if (err) + goto exit; + down_write(&e->vm->lock); + job = xe_bb_create_job(e, bb); + if (IS_ERR(job)) { + err = PTR_ERR(job); + goto vm_unlock; + } + } else { + vm = xe_migrate_get_vm(stream->gt->tile->migrate); + e = xe_engine_create(hwe->gt->tile->xe, vm, BIT(hwe->logical_instance), 1, + hwe, ENGINE_FLAG_WA); + if (IS_ERR(e)) { + err = PTR_ERR(e); + drm_err(&stream->oa->xe->drm, "gt%d, hwe %s, xe_engine_create,e failed=%d", + stream->gt->info.id, hwe->name, err); + goto put_vm; + } + + batch_ofs = xe_bo_ggtt_addr(stream->gt->tile->mem.kernel_bb_pool->bo); + /* Will add MI_BATCH_BUFFER_END */ + job = xe_bb_create_wa_job(e, bb, batch_ofs); + if (IS_ERR(job)) { + err = PTR_ERR(job); + goto put_engine; + } + } + + xe_sched_job_arm(job); + fence = dma_fence_get(&job->drm.s_fence->finished); + xe_sched_job_push(job); + + timeout = dma_fence_wait_timeout(fence, false, HZ); + dma_fence_put(fence); + if (timeout < 0) + err = timeout; + else if (!timeout) + err = -ETIME; +put_engine: + if (!stream->engine) + xe_engine_put(e); +put_vm: + if (!stream->engine) + xe_vm_put(vm); +vm_unlock: + if (stream->engine) { + dma_resv_unlock(&e->vm->resv); + up_write(&e->vm->lock); + } +exit: + return err; +} +#endif + +/* + FIXME: Currently submits only to stream->engine or new engine for + stream->hwe. If needed, add 'struct xe_engine *' argument + + For now unconditionally create engine otherwise we hit BUG_ON in + xe_bb_create_wa_job. If jobs need to be sent to the same engine for + serialization may need to replace xe_bb_create_wa_job with a similar + function. + + Also the code is wrong for xe_oa_guc_sw_ctx_id because there we need to + submit against the real engine/context rather than the new engine created + below. 
+*/ static int xe_oa_submit_bb(struct xe_oa_stream *stream, struct xe_bb *bb) { struct xe_hw_engine *hwe = stream->hwe; @@ -706,14 +807,16 @@ static int xe_oa_submit_bb(struct xe_oa_stream *stream, struct xe_bb *bb) int err = 0; vm = xe_migrate_get_vm(stream->gt->tile->migrate); - q = xe_exec_queue_create(hwe->gt->tile->xe, vm, BIT(hwe->logical_instance), 1, - hwe, EXEC_QUEUE_FLAG_WA); - if (IS_ERR(q)) { - err = PTR_ERR(q); - drm_err(&stream->oa->xe->drm, "gt%d, hwe %s, xe_exec_queue_create failed=%d", - stream->gt->info.id, hwe->name, err); - goto put_vm; - } + // if (!stream->exec_q) { + q = xe_exec_queue_create(hwe->gt->tile->xe, vm, BIT(hwe->logical_instance), 1, + hwe, EXEC_QUEUE_FLAG_WA); + if (IS_ERR(q)) { + err = PTR_ERR(q); + drm_err(&stream->oa->xe->drm, "gt%d, hwe %s, xe_exec_queue_create failed=%d", + stream->gt->info.id, hwe->name, err); + goto put_vm; + } + // } batch_ofs = xe_bo_ggtt_addr(stream->gt->tile->mem.kernel_bb_pool->bo); /* Will add MI_BATCH_BUFFER_END */ @@ -734,7 +837,8 @@ static int xe_oa_submit_bb(struct xe_oa_stream *stream, struct xe_bb *bb) else if (!timeout) err = -ETIME; put_exec_q: - xe_exec_queue_put(q); + // if (!stream->exec_q) + xe_exec_queue_put(q); put_vm: xe_vm_put(vm); @@ -750,11 +854,17 @@ static void xe_oa_free_configs(struct xe_oa_stream *stream) { struct xe_oa_config_bo *oa_bo, *tmp; + // FIXME: check functions below xe_oa_config_put(stream->oa_config); llist_for_each_entry_safe(oa_bo, tmp, stream->oa_config_bos.first, node) free_oa_config_bo(oa_bo); } +static void xe_oa_free_noa_wait(struct xe_oa_stream *stream) +{ + xe_bo_unpin_map_no_vm(stream->noa_wait); +} + static void xe_oa_store_flex(struct xe_oa_stream *stream, struct xe_lrc *lrc, struct xe_bb *bb, const struct flex *flex, u32 count) { @@ -825,6 +935,90 @@ static int xe_oa_modify_self(struct xe_oa_stream *stream, return err; } +static int xe_oa_configure_context(struct xe_oa_stream *stream, + struct xe_exec_queue *q, + struct flex *flex, u32 count) +{ + int i, err = 0; + + for (i = 0; i < q->width; i++) { + // flex->value = intel_sseu_make_rpcs(ce->engine->gt, &ce->sseu); // FIXME + err = xe_oa_modify_context(stream, &q->lrc[i], flex, count); + if (err) + break; + } + + return err; +} + +static int __xe_oa_configure_all_contexts(struct xe_oa_stream *stream, + struct flex *regs, + size_t num_regs, bool enable) +{ + struct xe_file *xef = stream->xef; + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + struct xe_exec_queue *q; + unsigned long idx; + int err; + + // FIXME: below crashes during close, need to check xef mutex + return 0; + + // FIXME: we can't use xef to find all engines since there may be multiple such files + mutex_lock(&xef->exec_queue.lock); + xa_for_each(&xef->exec_queue.xa, idx, q) { + xe_exec_queue_get(q); + err = xe_oa_configure_context(stream, q, regs, num_regs); + xe_exec_queue_put(q); + if (err) + return err; + } + mutex_unlock(&xef->exec_queue.lock); + + /* + * After updating all other contexts, we need to modify ourselves. If + * we don't modify the kernel_context, we do not get events while idle. + */ + for_each_hw_engine(hwe, stream->gt, id) { + /* + * FIXME: at present there is no way to create an engine using + * hwe->kernel_lrc. Also in xe we don't use kernel_lrc when idle, + * though we would need a 'context' restored to get events when idle + * to make sure registers are programmed correctly. 
+ */ + } + + return 0; +} + +static __UNUSED__ int +lrc_configure_all_contexts(struct xe_oa_stream *stream, + const struct xe_oa_config *oa_config) +{ + return 0; // FIXME: not used for gen12+ +} + +static int xe_oa_configure_all_contexts(struct xe_oa_stream *stream, bool enable) +{ +#define GEN8_R_PWR_CLK_STATE(base) XE_REG((base) + 0xc8) +#define CTX_R_PWR_CLK_STATE (0x42 + 1) + + struct flex regs[] = { + { + GEN8_R_PWR_CLK_STATE(RENDER_RING_BASE), + CTX_R_PWR_CLK_STATE, + }, + }; + + if (stream->hwe->class != XE_ENGINE_CLASS_RENDER) + return 0; + + // FIXME: what should this do when enable == false? + + return __xe_oa_configure_all_contexts(stream, regs, ARRAY_SIZE(regs), enable); +} + static int xe_oa_configure_oar_context(struct xe_oa_stream *stream, bool enable) { int err; @@ -857,7 +1051,10 @@ static int xe_oa_configure_oar_context(struct xe_oa_stream *stream, bool enable) }, }; - /* Modify stream hwe context image with regs_context */ + /* Modify stream hwe context image with regs_context + * FIXME: for now only modifying engine->lrc[0], but maybe this should + * be changed to modify all lrc's underlying the engine? + */ err = xe_oa_modify_context(stream, &stream->exec_q->lrc[0], regs_context, ARRAY_SIZE(regs_context)); if (err) @@ -884,6 +1081,9 @@ static void xe_oa_disable_metric_set(struct xe_oa_stream *stream) _MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING)); } + /* Reset all contexts' slices/subslices configurations. */ + xe_oa_configure_all_contexts(stream, false); + /* disable the context save/restore or OAR counters */ if (stream->exec_q) xe_oa_configure_oar_context(stream, false); @@ -898,10 +1098,21 @@ static void xe_oa_disable_metric_set(struct xe_oa_stream *stream) xe_mmio_rmw32(stream->gt, GEN12_SQCNT1, sqcnt1, 0); } +static int intel_guc_slpc_override_gucrc_mode(struct xe_gt *gt, u32 mode) +{ + return 0; // FIXME +} + +static int intel_guc_slpc_unset_gucrc_mode(struct xe_gt *gt) +{ + return 0; // FIXME +} + static void xe_oa_stream_destroy(struct xe_oa_stream *stream) { struct xe_oa_group *g = stream->hwe->oa_group; struct xe_gt *gt = stream->hwe->gt; + struct xe_oa *oa = stream->oa; if (WARN_ON(stream != g->exclusive_stream)) return; @@ -912,10 +1123,21 @@ static void xe_oa_stream_destroy(struct xe_oa_stream *stream) xe_oa_free_oa_buffer(stream); + /* Wa_16011777198:dg2: Unset the override of GUCRC mode to enable rc6 */ + if (stream->override_gucrc) + drm_WARN_ON(&stream->oa->xe->drm, intel_guc_slpc_unset_gucrc_mode(gt)); + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); xe_device_mem_access_put(stream->oa->xe); xe_oa_free_configs(stream); + xe_oa_free_noa_wait(stream); + + if (oa->spurious_report_rs.missed) { + drm_notice(&stream->oa->xe->drm, + "%d spurious OA report notices suppressed due to ratelimiting\n", + oa->spurious_report_rs.missed); + } } static int xe_oa_alloc_oa_buffer(struct xe_oa_stream *stream) @@ -937,6 +1159,197 @@ static int xe_oa_alloc_oa_buffer(struct xe_oa_stream *stream) return 0; } +static u32 *save_restore_register(struct xe_oa_stream *stream, u32 *cs, + bool save, struct xe_reg reg, u32 offset, + u32 dword_count) +{ + u32 cmd; + u32 d; + + cmd = save ? 
MI_STORE_REGISTER_MEM : MI_LOAD_REGISTER_MEM; + cmd |= MI_SRM_LRM_GLOBAL_GTT; + cmd++; + + for (d = 0; d < dword_count; d++) { + *cs++ = cmd; + *cs++ = reg.addr + 4 * d; + *cs++ = xe_bo_ggtt_addr(stream->noa_wait) + offset + 4 * d; + *cs++ = 0; + } + + return cs; +} + +static u64 xe_oa_ns_to_clock_interval(const struct xe_gt *gt, u64 ns) +{ + return DIV64_U64_ROUND_UP(gt->info.clock_freq * ns, NSEC_PER_SEC); +} + +static int xe_oa_alloc_noa_wait(struct xe_oa_stream *stream) +{ + struct xe_bo *bo; + const u64 delay_ticks = 0xffffffffffffffff - + xe_oa_ns_to_clock_interval(stream->gt, + atomic64_read(&stream->oa->noa_programming_delay)); + const u32 base = stream->hwe->mmio_base; +#define HAS_MI_SET_PREDICATE(xe) (GRAPHICS_VERx100(xe) >= 1270) +#define CS_GPR(x) GEN8_RING_CS_GPR(base, x) + u32 *batch, *ts0, *cs, *jump; + int ret, i; + enum { + START_TS, + NOW_TS, + DELTA_TS, + JUMP_PREDICATE, + DELTA_TARGET, + N_CS_GPR + }; + struct xe_reg mi_predicate_result = HAS_MI_SET_PREDICATE(stream->gt->tile->xe) ? + MI_PREDICATE_RESULT_2(base) : + MI_PREDICATE_RESULT_1(RENDER_RING_BASE); + + bo = xe_bo_create_pin_map(stream->gt->tile->xe, stream->gt->tile, NULL, + 8192, ttm_bo_type_kernel, + // XE_BO_CREATE_VRAM_IF_DGFX(gt) | + XE_BO_CREATE_SYSTEM_BIT | // FIXME: check + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + batch = cs = bo->vmap.is_iomem ? bo->vmap.vaddr_iomem : bo->vmap.vaddr; + stream->noa_wait = bo; + +#define GPR_SAVE_OFFSET 4096 +#define PREDICATE_SAVE_OFFSET 4160 + + /* Save registers. */ + for (i = 0; i < N_CS_GPR; i++) + cs = save_restore_register( + stream, cs, true /* save */, CS_GPR(i), + GPR_SAVE_OFFSET + 8 * i, 2); + cs = save_restore_register( + stream, cs, true /* save */, mi_predicate_result, + PREDICATE_SAVE_OFFSET, 1); + + /* First timestamp snapshot location. */ + ts0 = cs; + + /* + * Initial snapshot of the timestamp register to implement the wait. + * We work with 32b values, so clear out the top 32b bits of the + * register because the ALU works 64bits. + */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = CS_GPR(START_TS).addr + 4; + *cs++ = 0; + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = RING_TIMESTAMP(base).addr; + *cs++ = CS_GPR(START_TS).addr; + + /* + * This is the location we're going to jump back into until the + * required amount of time has passed. + */ + jump = cs; + + /* + * Take another snapshot of the timestamp register. Take care to clear + * up the top 32bits of CS_GPR(1) as we're using it for other + * operations below. + */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = CS_GPR(NOW_TS).addr + 4; + *cs++ = 0; + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = RING_TIMESTAMP(base).addr; + *cs++ = CS_GPR(NOW_TS).addr; + + /* + * Do a diff between the 2 timestamps and store the result back into + * CS_GPR(1). + */ + *cs++ = MI_MATH(5); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS)); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS)); + *cs++ = MI_MATH_SUB; + *cs++ = MI_MATH_STORE(MI_MATH_REG(DELTA_TS), MI_MATH_REG_ACCU); + *cs++ = MI_MATH_STORE(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF); + + /* + * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the + * timestamp have rolled over the 32bits) into the predicate register + * to be used for the predicated jump. 
+ */ + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = CS_GPR(JUMP_PREDICATE).addr; + *cs++ = mi_predicate_result.addr; + + if (HAS_MI_SET_PREDICATE(stream->gt->tile->xe)) + *cs++ = MI_SET_PREDICATE | 1; + + /* Restart from the beginning if we had timestamps roll over. */ + *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_PREDICATE; + // *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4; // FIXME + *cs++ = 0; + + if (HAS_MI_SET_PREDICATE(stream->gt->tile->xe)) + *cs++ = MI_SET_PREDICATE; + + /* + * Now add the diff between to previous timestamps and add it to : + * (((1 * << 64) - 1) - delay_ns) + * + * When the Carry Flag contains 1 this means the elapsed time is + * longer than the expected delay, and we can exit the wait loop. + */ + *cs++ = MI_LOAD_REGISTER_IMM(2); + *cs++ = CS_GPR(DELTA_TARGET).addr; + *cs++ = lower_32_bits(delay_ticks); + *cs++ = CS_GPR(DELTA_TARGET).addr + 4; + *cs++ = upper_32_bits(delay_ticks); + + *cs++ = MI_MATH(4); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(DELTA_TS)); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(DELTA_TARGET)); + *cs++ = MI_MATH_ADD; + *cs++ = MI_MATH_STOREINV(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF); + + *cs++ = MI_ARB_CHECK; + + /* + * Transfer the result into the predicate register to be used for the + * predicated jump. + */ + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = CS_GPR(JUMP_PREDICATE).addr; + *cs++ = mi_predicate_result.addr; + + if (HAS_MI_SET_PREDICATE(stream->gt->tile->xe)) + *cs++ = MI_SET_PREDICATE | 1; + + /* Predicate the jump. */ + *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_PREDICATE; + // *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4; // FIXME + *cs++ = 0; + + if (HAS_MI_SET_PREDICATE(stream->gt->tile->xe)) + *cs++ = MI_SET_PREDICATE; + + /* Restore registers. */ + for (i = 0; i < N_CS_GPR; i++) + cs = save_restore_register( + stream, cs, false /* restore */, CS_GPR(i), + GPR_SAVE_OFFSET + 8 * i, 2); + cs = save_restore_register( + stream, cs, false /* restore */, mi_predicate_result, + PREDICATE_SAVE_OFFSET, 1); + + /* And return to the ring. */ + *cs++ = MI_BATCH_BUFFER_END; + + return ret; +} + static void write_cs_mi_lri(struct xe_bb *bb, const struct xe_oa_reg *reg_data, u32 n_regs) { u32 i; @@ -981,7 +1394,11 @@ __xe_oa_alloc_config_buffer(struct xe_oa_stream *stream, struct xe_oa_config *oa config_length += num_lri_dwords(oa_config->mux_regs_len); config_length += num_lri_dwords(oa_config->b_counter_regs_len); config_length += num_lri_dwords(oa_config->flex_regs_len); +#if 1 // FIXME: noa_wait (see 93937659dc64) config_length++; /* MI_BATCH_BUFFER_END */ +#else + config_length += 4; /* MI_BATCH_BUFFER_START */ +#endif config_length = ALIGN(sizeof(u32) * config_length, XE_PAGE_SIZE) / sizeof(u32); bb = xe_bb_new(stream->gt, config_length, false); @@ -992,6 +1409,15 @@ __xe_oa_alloc_config_buffer(struct xe_oa_stream *stream, struct xe_oa_config *oa write_cs_mi_lri(bb, oa_config->b_counter_regs, oa_config->b_counter_regs_len); write_cs_mi_lri(bb, oa_config->flex_regs, oa_config->flex_regs_len); +#if 0 // FIXME: noa_wait (see 93937659dc64) + // xe_bb_create_job adds MI_BATCH_BUFFER_END + // TBD: how to handle noa_wait in xe_bb_create_job + + /* Jump into the active wait. 
*/ + bb->cs[bb->len++] = MI_BATCH_BUFFER_START; + bb->cs[bb->len++] = xe_bo_ggtt_addr(stream->noa_wait); + bb->cs[bb->len++] = 0; +#endif oa_bo->bb = bb; oa_bo->oa_config = xe_oa_config_get(oa_config); llist_add(&oa_bo->node, &stream->oa_config_bos); @@ -1020,6 +1446,7 @@ static struct xe_oa_config_bo *xe_oa_alloc_config_buffer(struct xe_oa_stream *st return oa_bo; } +// FIXME: check entire function and called functions static int xe_oa_emit_oa_config(struct xe_oa_stream *stream) { struct xe_oa_config_bo *oa_bo; @@ -1036,6 +1463,32 @@ static int xe_oa_emit_oa_config(struct xe_oa_stream *stream) return err; } +static __UNUSED__ void oa_context(struct xe_oa_stream *stream) {} + +static __UNUSED__ u32 oa_config_flex_reg(const struct xe_oa_config *oa_config, + struct xe_reg reg) +{ + u32 mmio = reg.addr; + int i; + + /* + * This arbitrary default will select the 'EU FPU0 Pipeline + * Active' event. In the future it's anticipated that there + * will be an explicit 'No Event' we can select, but not yet... + */ + if (!oa_config) + return 0; + + for (i = 0; i < oa_config->flex_regs_len; i++) { + if (oa_config->flex_regs[i].addr.addr == mmio) + return oa_config->flex_regs[i].value; + } + + return 0; +} + +static __UNUSED__ void gen8_update_reg_state_unlocked(const struct xe_oa_stream *stream) {} + static u32 oag_report_ctx_switches(const struct xe_oa_stream *stream) { return _MASKED_FIELD(GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS, @@ -1054,7 +1507,7 @@ static int xe_oa_enable_metric_set(struct xe_oa_stream *stream) * Disable thread stall DOP gating and EU DOP gating. */ if (stream->gt->tile->xe->info.platform == XE_DG2) { - xe_gt_mcr_multicast_write(stream->gt, GEN8_ROW_CHICKEN, + xe_gt_mcr_multicast_write(stream->gt, GEN8_ROW_CHICKEN, // FIXME: check _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE)); xe_mmio_write32(stream->gt, GEN7_ROW_CHICKEN2, _MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING)); @@ -1086,6 +1539,15 @@ static int xe_oa_enable_metric_set(struct xe_oa_stream *stream) xe_mmio_rmw32(stream->gt, GEN12_SQCNT1, 0, sqcnt1); + /* FIXME: do this later if needed + * + * Update all contexts prior writing the mux configurations as we need to make + * sure all slices/subslices are ON before writing to NOA registers. + */ + ret = xe_oa_configure_all_contexts(stream, true); + if (ret) + return ret; + /* * For Gen12, performance counters are context saved/restored. Only enable it * for the context that requested this. 
@@ -1099,6 +1561,19 @@ static int xe_oa_enable_metric_set(struct xe_oa_stream *stream) return xe_oa_emit_oa_config(stream); } +static __UNUSED__ void get_default_sseu_config(void) {} +static __UNUSED__ void get_sseu_config(void) {} + +static void xe_engine_set_nopreempt(struct xe_exec_queue *q) +{ + // FIXME +} + +static void xe_engine_clear_nopreempt(struct xe_exec_queue *q) +{ + // FIXME +} + static void xe_oa_stream_enable(struct xe_oa_stream *stream) { stream->pollin = false; @@ -1127,6 +1602,9 @@ static void xe_oa_enable_locked(struct xe_oa_stream *stream) stream->enabled = true; xe_oa_stream_enable(stream); + + if (stream->hold_preemption) + xe_engine_set_nopreempt(stream->exec_q); } static void xe_oa_disable_locked(struct xe_oa_stream *stream) @@ -1136,6 +1614,9 @@ static void xe_oa_disable_locked(struct xe_oa_stream *stream) stream->enabled = false; + if (stream->hold_preemption) + xe_engine_clear_nopreempt(stream->exec_q); + xe_oa_stream_disable(stream); } @@ -1152,6 +1633,7 @@ static long xe_oa_config_locked(struct xe_oa_stream *stream, if (config != stream->oa_config) { int err; + // FIXME: check: does the config have to be emitted on the stream engine? /* * If OA is bound to a specific engine, emit the reconfiguration * inline from that engine. The update will then be ordered with @@ -1209,7 +1691,7 @@ static void xe_oa_destroy_locked(struct xe_oa_stream *stream) xe_oa_stream_destroy(stream); if (stream->exec_q) - xe_exec_queue_put(stream->exec_q); + xe_exec_queue_put(stream->exec_q); // FIXME: check kfree(stream); } @@ -1240,6 +1722,8 @@ static const struct file_operations xe_oa_fops = { .poll = xe_oa_poll, .read = xe_oa_read, .unlocked_ioctl = xe_oa_ioctl, + // FIXME: check .compat_ioctl later, maybe skip for now + .compat_ioctl = xe_oa_ioctl, }; static bool engine_supports_mi_query(struct xe_hw_engine *hwe) @@ -1247,6 +1731,11 @@ static bool engine_supports_mi_query(struct xe_hw_engine *hwe) return hwe->class == XE_ENGINE_CLASS_RENDER; } +static void xe_oa_pin_context(struct xe_oa_stream *stream) +{ + // contexts are already pinned for now, there's no unpin +} + #define MI_LRI_LEN(x) (((x) & 0xff) + 1) static bool xe_oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end) @@ -1269,10 +1758,17 @@ static bool xe_oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end) static u32 xe_oa_context_image_offset(struct xe_oa_stream *stream, u32 reg) { + // FIXME: check len and state assignments below u32 len = (xe_lrc_size(stream->gt->tile->xe, stream->hwe->class) - PAGE_SIZE) / 4; u32 *state = stream->gt->default_lrc[stream->hwe->class]; u32 offset; + /* + * FIXME: maybe ok but really __xe_lrc_regs_offset should be added to + * state. The same offset should be used in xe_oa_configure_oar_context + * where ctx_oactxctrl_offset is consumed. Also instead of default_lrc + * we could use stream->engine->lrc or stream->hwe->kernel_lrc + */ if (drm_WARN_ON(&stream->oa->xe->drm, !state)) return U32_MAX; @@ -1313,6 +1809,126 @@ static int xe_oa_set_ctx_ctrl_offset(struct xe_oa_stream *stream) return offset && offset != U32_MAX ? 
0 : -ENODEV; } +static void __store_reg_to_mem(struct xe_bb *bb, struct xe_reg reg, u32 ggtt_offset) +{ + u32 cmd; + + cmd = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; + cmd++; + + bb->cs[bb->len++] = cmd; + bb->cs[bb->len++] = reg.addr; + bb->cs[bb->len++] = ggtt_offset; + bb->cs[bb->len++] = 0; +} + +static int __read_reg(struct xe_oa_stream *stream, struct xe_reg reg, u32 ggtt_offset) +{ + struct xe_bb *bb; + int err = 0; + + bb = xe_bb_new(stream->gt, 4 + 1, false); + if (IS_ERR(bb)) { + err = PTR_ERR(bb); + goto exit; + } + + __store_reg_to_mem(bb, reg, ggtt_offset); + + err = xe_oa_submit_bb(stream, bb); + xe_bb_free(bb, NULL); +exit: + return err; +} + +static int xe_oa_guc_sw_ctx_id(struct xe_oa_stream *stream, u32 *ctx_id) +{ + struct xe_bo *bo; + u32 *ptr; + int err = 0; + + bo = xe_bo_create_pin_map(stream->gt->tile->xe, stream->gt->tile, NULL, + 4096, ttm_bo_type_kernel, + // XE_BO_CREATE_VRAM_IF_DGFX(gt) | + XE_BO_CREATE_SYSTEM_BIT | // FIXME: check + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(bo)) { + err = PTR_ERR(bo); + goto exit; + } + + err = __read_reg(stream, RING_EXECLIST_STATUS_HI(stream->hwe->mmio_base), + xe_bo_ggtt_addr(bo)); + if (err) + goto unpin; + + ptr = bo->vmap.is_iomem ? bo->vmap.vaddr_iomem : bo->vmap.vaddr; + + *ctx_id = *ptr; +unpin: + xe_bo_unpin_map_no_vm(bo); +exit: + return err; +} + +static int __xe_oa_get_render_context_id(struct xe_oa_stream *stream) +{ + u32 ctx_id, mask; + int ret; + +// FIXME: only retain the GuC case here, we only support GuC + +#define GEN12_GUC_SW_CTX_ID_MASK GENMASK(22, 7) + +#define GEN11_SW_CTX_ID_SHIFT 37 +#define GEN11_SW_CTX_ID_WIDTH 11 +#define XEHP_SW_CTX_ID_SHIFT 39 +#define XEHP_SW_CTX_ID_WIDTH 16 +#define XEHP_SW_COUNTER_SHIFT 58 +#define XEHP_SW_COUNTER_WIDTH 6 +#define MAX_CONTEXT_HW_ID (1 << 21) /* exclusive */ +#define GEN11_MAX_CONTEXT_HW_ID (1 << 11) /* exclusive */ +/* in Gen12 ID 0x7FF is reserved to indicate idle */ +#define GEN12_MAX_CONTEXT_HW_ID (GEN11_MAX_CONTEXT_HW_ID - 1) +/* in Xe_HP ID 0xFFFF is reserved to indicate "invalid context" */ +#define XEHP_MAX_CONTEXT_HW_ID 0xFFFF + + if (xe_device_guc_submission_enabled(stream->gt->tile->xe)) { + ret = xe_oa_guc_sw_ctx_id(stream, &ctx_id); + if (ret) + return ret; + + mask = GEN12_GUC_SW_CTX_ID_MASK; + } else if (GRAPHICS_VERx100(stream->gt->tile->xe) >= 1250) { + ctx_id = (XEHP_MAX_CONTEXT_HW_ID - 1) << + (XEHP_SW_CTX_ID_SHIFT - 32); + + mask = ((1U << XEHP_SW_CTX_ID_WIDTH) - 1) << + (XEHP_SW_CTX_ID_SHIFT - 32); + } else { + ctx_id = (GEN12_MAX_CONTEXT_HW_ID - 1) << + (GEN11_SW_CTX_ID_SHIFT - 32); + + mask = ((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << + (GEN11_SW_CTX_ID_SHIFT - 32); + } + stream->specific_ctx_id = ctx_id & mask; + stream->specific_ctx_id_mask = mask; + + return 0; +} + +static int xe_oa_get_render_ctx_id(struct xe_oa_stream *stream) +{ + int ret = __xe_oa_get_render_context_id(stream); + + drm_dbg(&stream->oa->xe->drm, + "filtering on ctx_id=0x%x ctx_id_mask=0x%x\n", + stream->specific_ctx_id, stream->specific_ctx_id_mask); + + return ret; +} + static int xe_oa_stream_init(struct xe_oa_stream *stream, struct xe_oa_open_properties *props) { @@ -1329,25 +1945,43 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream, stream->sample = props->sample; stream->sample_size += stream->oa_buffer.format->size; + stream->hold_preemption = props->hold_preemption; stream->periodic = props->oa_periodic; stream->period_exponent = props->oa_period_exponent; - if (stream->exec_q && engine_supports_mi_query(stream->hwe)) { - /* If we don't find the 
context offset, just return error */ - ret = xe_oa_set_ctx_ctrl_offset(stream); + if (stream->exec_q) { + xe_oa_pin_context(stream); // FIXME: empty function, no unpin + + if (engine_supports_mi_query(stream->hwe)) { + /* If we don't find the context offset, just return error */ + ret = xe_oa_set_ctx_ctrl_offset(stream); + if (ret) { + drm_err(&stream->gt->tile->xe->drm, + "xe_oa_set_ctx_ctrl_offset failed for %s\n", + stream->hwe->name); + goto exit; + } + } + + // FIXME: do later, also put_render_ctx_id not needed, deleted, check + ret = xe_oa_get_render_ctx_id(stream); if (ret) { - drm_err(&stream->gt->tile->xe->drm, - "xe_oa_set_ctx_ctrl_offset failed for %s\n", - stream->hwe->name); + drm_dbg(&oa->xe->drm, "Invalid context id to filter with\n"); goto exit; } } + ret = xe_oa_alloc_noa_wait(stream); // FIXME: do later + if (ret) { + drm_dbg(&oa->xe->drm, "Unable to allocate NOA wait batch buffer\n"); + goto exit; + } + stream->oa_config = xe_oa_get_oa_config(oa, props->metrics_set); if (!stream->oa_config) { drm_dbg(&oa->xe->drm, "Invalid OA config id=%i\n", props->metrics_set); ret = -EINVAL; - goto exit; + goto err_free_noa_wait; } ret = xe_oa_alloc_oa_buffer(stream); @@ -1358,10 +1992,30 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream, xe_device_mem_access_get(stream->oa->xe); XE_WARN_ON(xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + /* FIXME: Do later if needed (DG2 not POR for xe) + * + * Wa_16011777198:dg2: GuC resets render as part of the Wa. This causes + * OA to lose the configuration state. Prevent this by overriding GUCRC + * mode. + */ + if (xe_device_guc_submission_enabled(oa->xe) && + (IS_SUBPLATFORM_STEP(oa->xe, XE_DG2, XE_SUBPLATFORM_DG2_G10, STEP_A0, STEP_C0) || + IS_SUBPLATFORM_STEP(oa->xe, XE_DG2, XE_SUBPLATFORM_DG2_G11, STEP_A0, STEP_B0))) { + ret = intel_guc_slpc_override_gucrc_mode(gt, 0); // FIXME + if (ret) { + drm_dbg(&oa->xe->drm, "Unable to override gucrc mode\n"); + goto err_fw_put; + } + + stream->override_gucrc = true; + } + + // stream->engine->gt->perf.sseu = props->sseu; // FIXME + ret = xe_oa_enable_metric_set(stream); if (ret) { drm_dbg(&oa->xe->drm, "Unable to enable metric set\n"); - goto err_fw_put; + goto err_unset_gucrc; } drm_dbg(&oa->xe->drm, "opening stream oa config uuid=%s\n", @@ -1378,17 +2032,24 @@ static int xe_oa_stream_init(struct xe_oa_stream *stream, return 0; -err_fw_put: +err_unset_gucrc: xe_oa_disable_metric_set(stream); + if (stream->override_gucrc) + intel_guc_slpc_unset_gucrc_mode(gt); +err_fw_put: XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); xe_device_mem_access_put(stream->oa->xe); xe_oa_free_oa_buffer(stream); err_free_configs: xe_oa_free_configs(stream); +err_free_noa_wait: + xe_oa_free_noa_wait(stream); exit: return ret; } +__UNUSED__ void xe_oa_init_reg_state(void) {} + static int xe_oa_stream_open_ioctl_locked(struct xe_oa *oa, struct drm_xe_oa_open_param *param, @@ -1403,6 +2064,9 @@ xe_oa_stream_open_ioctl_locked(struct xe_oa *oa, int stream_fd; int ret; + // FIXME: check: some checks and initialization has been moved + // between stream_open_ioctl_locked, xe_oa_stream_init and read_properties + if (props->single_exec_q) { q = xe_exec_queue_lookup(xef, props->exec_q_id); if (XE_IOCTL_DBG(oa->xe, !q)) { @@ -1419,6 +2083,17 @@ xe_oa_stream_open_ioctl_locked(struct xe_oa *oa, if (q && !props->sample) privileged_op = false; + if (props->hold_preemption) { + if (!q) { + drm_dbg(&oa->xe->drm, "preemption disable with no engine\n"); + ret = -EINVAL; + goto err_exec_q; + } + privileged_op = 
true; + } + + // get_default_sseu_config(&props->sseu, props->engine); // FIXME + if (privileged_op && xe_oa_stream_paranoid && !perfmon_capable()) { drm_dbg(&oa->xe->drm, "Insufficient privileges to open xe perf stream\n"); ret = -EACCES; @@ -1444,6 +2119,7 @@ xe_oa_stream_open_ioctl_locked(struct xe_oa *oa, goto err_exec_q; } + stream->xef = xef; stream->oa = oa; stream->exec_q = q; @@ -1496,7 +2172,7 @@ u32 xe_oa_timestamp_frequency(struct xe_device *xe) switch (xe->info.platform) { case XE_DG2: case XE_METEORLAKE: - xe_device_mem_access_get(xe); + xe_device_mem_access_get(xe); // FIXME: check XE_WARN_ON(xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL)); reg = xe_mmio_read32(xe_root_mmio_gt(xe), RPM_CONFIG0); XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); @@ -1508,6 +2184,9 @@ u32 xe_oa_timestamp_frequency(struct xe_device *xe) default: return xe_root_mmio_gt(xe)->info.clock_freq; } + + // FIXME: should this be per gt, even in i915? + } static u64 oa_exponent_to_ns(struct xe_oa *oa, int exponent) @@ -1617,6 +2296,17 @@ static int xe_oa_read_properties_unlocked(struct xe_oa *oa, u64 __user *uprops, props->oa_periodic = true; props->oa_period_exponent = value; break; + case DRM_XE_OA_PROP_HOLD_PREEMPTION: + // FIXME: do this later + props->hold_preemption = value; + break; + case DRM_XE_OA_PROP_GLOBAL_SSEU: + /* + * FIXME: Confirm this, on i915 supportd only for < 12.5 + * xe_oa_open_properties.has_sseu is removed (always false) + */ + drm_dbg(&oa->xe->drm, "SSEU config not supported\n"); + return -ENODEV; case DRM_XE_OA_PROP_POLL_OA_PERIOD: if (value < 100000 /* 100us */) { drm_dbg(&oa->xe->drm, "OA timer too small (%lluns < 100us)\n", @@ -1664,6 +2354,19 @@ static int xe_oa_read_properties_unlocked(struct xe_oa *oa, u64 __user *uprops, return -EINVAL; } +#if 0 // FIXME: Do this later + /* + * Wa_14017512683: mtl[a0..c0): Use of OAM must be preceded with Media + * C6 disable in BIOS. Fail if Media C6 is enabled on steppings where OAM + * does not work as expected. + */ + if (IS_MTL_MEDIA_STEP(props->engine->xe, STEP_A0, STEP_C0) && + props->engine->oa_group->type == TYPE_OAM && + intel_check_bios_c6_setup(&props->engine->gt->rc6)) { + drm_dbg(&oa->xe->drm, "OAM requires media C6 to be disabled in BIOS\n"); + return -EINVAL; + } +#endif f = &oa->oa_formats[props->oa_format]; if (!props->oa_format || !f->size || !engine_supports_oa_format(props->hwe, f->type)) { @@ -1768,6 +2471,7 @@ static const struct xe_mmio_range mtl_oam_b_counters[] = { {} }; +/* FIXME: Checks below have been simplified/loosened for now compared with i915 */ static bool xe_oa_is_valid_b_counter_addr(struct xe_oa *oa, u32 addr) { return xe_oa_reg_in_range_table(addr, xehp_oa_b_counters) || @@ -2281,6 +2985,27 @@ int xe_oa_init(struct xe_device *xe) mutex_init(&oa->metrics_lock); idr_init_base(&oa->metrics_idr, 1); + /* We set up some ratelimit state to potentially throttle any + * _NOTES about spurious, invalid OA reports which we don't + * forward to userspace. + * + * We print a _NOTE about any throttling when closing the + * stream instead of waiting until driver _fini which no one + * would ever see. + * + * Using the same limiting factors as printk_ratelimit() + */ + ratelimit_state_init(&oa->spurious_report_rs, 5 * HZ, 10); + /* Since we use a DRM_NOTE for spurious reports it would be + * inconsistent to let __ratelimit() automatically print a + * warning for throttling. 
+ */ + ratelimit_set_flags(&oa->spurious_report_rs, RATELIMIT_MSG_ON_RELEASE); + ratelimit_state_init(&oa->tail_pointer_race, 5 * HZ, 10); + ratelimit_set_flags(&oa->tail_pointer_race,RATELIMIT_MSG_ON_RELEASE); + + atomic64_set(&oa->noa_programming_delay, 500 * 1000 /* 500us */); + ret = xe_oa_init_engine_groups(oa); if (ret) { drm_err(&xe->drm, "OA initialization failed %d\n", ret); diff --git a/drivers/gpu/drm/xe/xe_oa.h b/drivers/gpu/drm/xe/xe_oa.h index 41a7d8b0f10e1..cb716ffc4f7c9 100644 --- a/drivers/gpu/drm/xe/xe_oa.h +++ b/drivers/gpu/drm/xe/xe_oa.h @@ -8,6 +8,7 @@ #include "xe_oa_types.h" +/* Below __UNUSED__ refers to exported oa functions not called from other parts of xe */ int xe_oa_init(struct xe_device *xe); void xe_oa_fini(struct xe_device *xe); void xe_oa_register(struct xe_device *xe); @@ -22,6 +23,12 @@ int xe_oa_add_config_ioctl(struct drm_device *dev, void *data, struct drm_file *file); int xe_oa_remove_config_ioctl(struct drm_device *dev, void *data, struct drm_file *file); +void xe_oa_init_reg_state(void); // __UNUSED__ + +// struct xe_oa_config *xe_oa_get_oa_config(struct xe_oa *oa, int metrics_set); // __UNUSED__ +// struct xe_oa_config *xe_oa_config_get(struct xe_oa_config *oa_config); // __UNUSED__ +// void xe_oa_config_put(struct xe_oa_config *oa_config); // __UNUSED__ + u32 xe_oa_timestamp_frequency(struct xe_device *xe); u32 xe_oa_unit_id(struct xe_hw_engine *hwe); diff --git a/drivers/gpu/drm/xe/xe_oa_types.h b/drivers/gpu/drm/xe/xe_oa_types.h index 58164ff0b6a48..382566c85f75e 100644 --- a/drivers/gpu/drm/xe/xe_oa_types.h +++ b/drivers/gpu/drm/xe/xe_oa_types.h @@ -110,6 +110,12 @@ struct xe_oa_gt { /** @lock: lock associated with anything below within this structure */ struct mutex lock; + /** FIXME + * @sseu: sseu configuration selected to run while perf is active, + * applies to all contexts. + */ + // struct intel_sseu sseu; + /** @num_oa_groups: number of oa groups per gt */ u32 num_oa_groups; @@ -138,9 +144,23 @@ struct xe_oa { */ struct idr metrics_idr; + /** + * @spurious_report_rs: For rate limiting notifications of spurious + * invalid OA reports + */ + struct ratelimit_state spurious_report_rs; + + /** @tail_pointer_race: For rate limiting notifications of tail pointer race */ + struct ratelimit_state tail_pointer_race; + /** @ctx_oactxctrl_offset: offset of OACTXCONTROL register in context image */ u32 ctx_oactxctrl_offset; + // u32 gen7_latched_oastatus1; // FIXME + // u32 ctx_flexeu0_offset; + // u32 gen8_valid_ctx_bit; // FIXME: deleted + // struct i915_oa_ops ops; // FIXME: these are deleted + /** @oa_formats: tracks all OA formats across platforms */ const struct xe_oa_format *oa_formats; @@ -149,6 +169,9 @@ struct xe_oa { /** @format_mask: tracks valid OA formats for a platform */ unsigned long format_mask[FORMAT_MASK_SIZE]; + /** @noa_programming_delay: NOA wait programmed delay value */ + atomic64_t noa_programming_delay; + /** @oa_unit_ids: tracks oa unit ids assigned across gt's */ u32 oa_unit_ids; }; @@ -157,12 +180,17 @@ struct xe_oa { * struct xe_oa_stream - state for a single open stream FD */ struct xe_oa_stream { + // FIXME: xef doesn't work, see __xe_oa_configure_all_contexts + /** @xef: xe_file associated with oa stream */ + struct xe_file *xef; + /** @oa: xe_oa backpointer */ struct xe_oa *oa; /** @gt: gt associated with the oa stream */ struct xe_gt *gt; + // FIXME: struct xe_hw_engine instead of intel_engine_cs /** * @hwe: hardware engine associated with this performance stream. 
*/ @@ -185,6 +213,7 @@ struct xe_oa_stream { */ int sample_size; + /* FIXME: struct xe_engine instead of i915_gem_context */ /** * @exec_q: %NULL if measuring system-wide across all exec_q's or a * specific exec_q that is being monitored. */ @@ -198,6 +227,21 @@ struct xe_oa_stream { */ bool enabled; + /** + * @hold_preemption: Whether preemption is put on hold for command + * submissions done on the @engine. This is useful for some drivers that + * cannot easily post process the OA buffer context to subtract delta + * of performance counters not associated with @engine. + */ + bool hold_preemption; + + // FIXME: these are deleted + /** + * @ops: The callbacks providing the implementation of this specific + * type of configured stream. + */ + // const struct xe_perf_stream_ops *ops; + /** @oa_config: The OA configuration used by the stream */ struct xe_oa_config *oa_config; @@ -207,6 +251,12 @@ struct xe_oa_stream { */ struct llist_head oa_config_bos; + // FIXME: not needed for xe, should be 'struct xe_lrc *' if needed + /** + * @pinned_ctx: The OA context specific information. + */ + // struct intel_context *pinned_ctx; + /** @specific_ctx_id: id of the context used for filtering reports */ u32 specific_ctx_id; @@ -249,6 +299,9 @@ struct xe_oa_stream { /** @last_ctx_id: last context id for OA data added */ u32 last_ctx_id; + // struct xe_vma *vma; + // int size_exponent; + /** * @ptr_lock: Locks reads and writes to all head/tail state * @@ -286,10 +339,22 @@ struct xe_oa_stream { u32 tail; } oa_buffer; + /** + * @noa_wait: A batch buffer doing a wait on the GPU for the NOA + * logic to be reprogrammed. + */ + struct xe_bo *noa_wait; + /** * @poll_oa_period: The period in nanoseconds at which the OA * buffer should be checked for available data. */ u64 poll_oa_period; + + /** + * @override_gucrc: GuC RC has been overridden for the perf stream, + * and we need to restore the default configuration on release. + */ + bool override_gucrc; }; #endif diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index b4ab07c285245..ec2af7244e1de 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -1146,6 +1146,8 @@ enum drm_xe_oa_property_id { */ DRM_XE_OA_PROP_POLL_OA_PERIOD, + /* FIXME: Should the OA unit be identified by OA unit id? In that case what happens when multiple engines are connected to an OA unit? */ /** * Multiple engines may be mapped to the same OA unit. The OA unit is * identified by class:instance of any engine mapped to it.