From patchwork Wed Jul 5 18:03:38 2023 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [v2] drm/xe: initial changes for XE OA From: Ashutosh Dixit X-Patchwork-Id: 545675 Message-Id: <20230705180338.1528851-1-ashutosh.dixit@intel.com> To: intel-gfx-trybot@lists.freedesktop.org Date: Wed, 5 Jul 2023 11:03:38 -0700 Somewhat functional but needs cleanup v2: - Include oa_unit_id in uapi - Disable unlanded reports debug message Signed-off-by: Ashutosh Dixit --- drivers/gpu/drm/i915/i915_perf_oa_regs.h | 4 +- drivers/gpu/drm/xe/Makefile | 1 + drivers/gpu/drm/xe/regs/xe_engine_regs.h | 5 + drivers/gpu/drm/xe/regs/xe_gpu_commands.h | 18 + drivers/gpu/drm/xe/regs/xe_oa_regs.h | 174 ++ drivers/gpu/drm/xe/xe_device.c | 16 + drivers/gpu/drm/xe/xe_device_types.h | 4 + drivers/gpu/drm/xe/xe_gt_types.h | 3 + drivers/gpu/drm/xe/xe_hw_engine_types.h | 2 + drivers/gpu/drm/xe/xe_module.c | 5 + drivers/gpu/drm/xe/xe_oa.c | 3429 +++++++++++++++++++++ drivers/gpu/drm/xe/xe_oa.h | 402 +++ drivers/gpu/drm/xe/xe_query.c | 7 +- include/uapi/drm/xe_drm.h | 285 +- 14 files changed, 4351 insertions(+), 4 deletions(-) create mode 100644 drivers/gpu/drm/xe/regs/xe_oa_regs.h create mode 100644 drivers/gpu/drm/xe/xe_oa.c create mode 100644 drivers/gpu/drm/xe/xe_oa.h diff --git a/drivers/gpu/drm/i915/i915_perf_oa_regs.h b/drivers/gpu/drm/i915/i915_perf_oa_regs.h index e5ac7a8b5cb6e..e775871f570e5 100644 --- a/drivers/gpu/drm/i915/i915_perf_oa_regs.h +++ b/drivers/gpu/drm/i915/i915_perf_oa_regs.h @@ -45,8 +45,8 @@ #define GEN8_OACTXCONTROL _MMIO(0x2360) #define GEN8_OA_TIMER_PERIOD_MASK 0x3F #define GEN8_OA_TIMER_PERIOD_SHIFT 2 -#define GEN8_OA_TIMER_ENABLE (1 << 1) -#define GEN8_OA_COUNTER_RESUME (1 << 0) +#define GEN8_OA_TIMER_ENABLE BIT(1) +#define GEN8_OA_COUNTER_RESUME BIT(0) #define GEN7_OABUFFER _MMIO(0x23B0) /* R/W */ #define GEN7_OABUFFER_OVERRUN_DISABLE (1 << 3) diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index 336f0eb8f91ef..1be30cfbc2878 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -82,6 +82,7 @@ xe-y += xe_bb.o \ xe_mmio.o \ xe_mocs.o \ xe_module.o \ + xe_oa.o \ xe_pat.o \ xe_pci.o \ xe_pcode.o \ diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h index 79873bf64e8dd..044a4920f1568 100644 --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h @@ -84,6 +84,9 @@ #define RING_CTX_TIMESTAMP(base) XE_REG((base) + 0x3a8) +#define MI_PREDICATE_RESULT_2(base) XE_REG((base) + 0x3bc) +#define MI_PREDICATE_RESULT_1(base) XE_REG((base) + 0x41c) + #define RING_FORCE_TO_NONPRIV(base, i) XE_REG(((base) + 0x4d0) + (i) * 4) #define RING_FORCE_TO_NONPRIV_DENY REG_BIT(30) #define RING_FORCE_TO_NONPRIV_ACCESS_MASK REG_GENMASK(29, 28) @@ -108,6 +111,8 @@ #define RING_EXECLIST_CONTROL(base) XE_REG((base) + 0x550) #define EL_CTRL_LOAD REG_BIT(0) +#define GEN8_RING_CS_GPR(base, n) XE_REG((base) + 0x600 + (n) * 8) + #define VDBOX_CGCTL3F10(base) XE_REG((base) + 0x3f10) #define IECPUNIT_CLKGATE_DIS REG_BIT(22) diff --git a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h index 12120dd37aa2a..632f96af38ec2 100644 --- a/drivers/gpu/drm/xe/regs/xe_gpu_commands.h +++ b/drivers/gpu/drm/xe/regs/xe_gpu_commands.h @@ -16,6 +16,7 @@ (__INSTR(INSTR_MI_CLIENT) | (opcode) << 23 | (flags)) #define MI_NOOP MI_INSTR(0, 0) +#define MI_SET_PREDICATE MI_INSTR(0x01, 0) #define MI_USER_INTERRUPT MI_INSTR(0x02, 0) #define
MI_ARB_ON_OFF MI_INSTR(0x08, 0) @@ -23,6 +24,20 @@ #define MI_ARB_DISABLE (0<<0) #define MI_BATCH_BUFFER_END MI_INSTR(0x0a, 0) + +#define MI_MATH(x) MI_INSTR(0x1a, (x) - 1) +#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2)) +#define MI_MATH_LOAD(op1, op2) MI_MATH_INSTR(0x080, op1, op2) +#define MI_MATH_ADD MI_MATH_INSTR(0x100, 0x0, 0x0) +#define MI_MATH_SUB MI_MATH_INSTR(0x101, 0x0, 0x0) +#define MI_MATH_STORE(op1, op2) MI_MATH_INSTR(0x180, op1, op2) +#define MI_MATH_STOREINV(op1, op2) MI_MATH_INSTR(0x580, op1, op2) +#define MI_MATH_REG(x) (x) +#define MI_MATH_REG_SRCA 0x20 +#define MI_MATH_REG_SRCB 0x21 +#define MI_MATH_REG_ACCU 0x31 +#define MI_MATH_REG_CF 0x33 + #define MI_STORE_DATA_IMM MI_INSTR(0x20, 0) #define MI_LOAD_REGISTER_IMM(x) MI_INSTR(0x22, 2*(x)-1) @@ -37,7 +52,10 @@ #define MI_FLUSH_DW_OP_STOREDW (1<<14) #define MI_FLUSH_DW_USE_GTT (1<<2) +#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 1) + #define MI_BATCH_BUFFER_START MI_INSTR(0x31, 1) +#define MI_BATCH_PREDICATE REG_BIT(15) /* HSW+ on RCS only*/ #define XY_CTRL_SURF_COPY_BLT ((2 << 29) | (0x48 << 22) | 3) #define SRC_ACCESS_TYPE_SHIFT 21 diff --git a/drivers/gpu/drm/xe/regs/xe_oa_regs.h b/drivers/gpu/drm/xe/regs/xe_oa_regs.h new file mode 100644 index 0000000000000..4eafb1038b03f --- /dev/null +++ b/drivers/gpu/drm/xe/regs/xe_oa_regs.h @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2022 Intel Corporation + */ + +#ifndef __XE_OA_REGS__ +#define __XE_OA_REGS__ + +#define PERF_REG XE_REG + +#define REG_EQUAL(reg, xe_reg) ((reg) == (xe_reg.addr)) +#define REG_EQUAL_MCR(reg, xe_reg) ((reg) == (xe_reg.__reg.addr)) + +#define HALF_SLICE_CHICKEN2 XE_REG_MCR(0xe180) +#define GEN8_ST_PO_DISABLE BIT(13) + +#define GEN7_ROW_CHICKEN2 XE_REG(0xe4f4) +#define GEN8_ROW_CHICKEN XE_REG_MCR(0xe4f0) +#define STALL_DOP_GATING_DISABLE REG_BIT(5) +#define GEN12_DISABLE_DOP_GATING REG_BIT(0) + +#define RPM_CONFIG1 XE_REG(0xd04) +#define GEN10_GT_NOA_ENABLE REG_BIT(9) + +#define WAIT_FOR_RC6_EXIT XE_REG(0x20cc) +#define HSW_WAIT_FOR_RC6_EXIT_ENABLE BIT(0) + +#define EU_PERF_CNTL0 PERF_REG(0xe458) +#define EU_PERF_CNTL4 PERF_REG(0xe45c) +#define EU_PERF_CNTL1 PERF_REG(0xe558) +#define EU_PERF_CNTL5 PERF_REG(0xe55c) +#define EU_PERF_CNTL2 PERF_REG(0xe658) +#define EU_PERF_CNTL6 PERF_REG(0xe65c) +#define EU_PERF_CNTL3 PERF_REG(0xe758) + +#define OABUFFER_SIZE_128K (0 << 3) +#define OABUFFER_SIZE_256K (1 << 3) +#define OABUFFER_SIZE_512K (2 << 3) +#define OABUFFER_SIZE_1M (3 << 3) +#define OABUFFER_SIZE_2M (4 << 3) +#define OABUFFER_SIZE_4M (5 << 3) +#define OABUFFER_SIZE_8M (6 << 3) +#define OABUFFER_SIZE_16M (7 << 3) + +#define GEN12_OA_TLB_INV_CR XE_REG(0xceec) + +/* Gen12 OAR unit */ +#define GEN12_OAR_OACONTROL XE_REG(0x2960) +#define GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT 1 +#define GEN12_OAR_OACONTROL_COUNTER_ENABLE (1 << 0) + +#define GEN8_OACTXCONTROL XE_REG(0x2360) +#define GEN8_OA_COUNTER_RESUME (1 << 0) + +#define GEN12_OACTXCONTROL(base) XE_REG((base) + 0x360) +#define GEN12_OAR_OASTATUS XE_REG(0x2968) + +/* Gen12 OAG unit */ +#define GEN12_OAG_OAHEADPTR XE_REG(0xdb00) +#define GEN12_OAG_OAHEADPTR_MASK 0xffffffc0 +#define GEN12_OAG_OATAILPTR XE_REG(0xdb04) +#define GEN12_OAG_OATAILPTR_MASK 0xffffffc0 + +#define GEN12_OAG_OABUFFER XE_REG(0xdb08) +#define GEN12_OAG_OABUFFER_BUFFER_SIZE_MASK (0x7) +#define GEN12_OAG_OABUFFER_BUFFER_SIZE_SHIFT (3) +#define GEN12_OAG_OABUFFER_MEMORY_SELECT (1 << 0) /* 0: PPGTT, 1: GGTT */ + +#define GEN12_OAG_OAGLBCTXCTRL XE_REG(0x2b28) +#define 
GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT 2 +#define GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE (1 << 1) +#define GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME (1 << 0) + +#define GEN12_OAG_OACONTROL XE_REG(0xdaf4) +#define GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT 2 +#define GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE (1 << 0) + +#define GEN12_OAG_OA_DEBUG XE_REG(0xdaf8) +#define GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO (1 << 6) +#define GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS (1 << 5) +#define GEN12_OAG_OA_DEBUG_DISABLE_GO_1_0_REPORTS (1 << 2) +#define GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS (1 << 1) + +#define GEN12_OAG_OASTATUS XE_REG(0xdafc) +#define GEN12_OAG_OASTATUS_COUNTER_OVERFLOW (1 << 2) +#define GEN12_OAG_OASTATUS_BUFFER_OVERFLOW (1 << 1) +#define GEN12_OAG_OASTATUS_REPORT_LOST (1 << 0) + +#define GDT_CHICKEN_BITS XE_REG(0x9840) +#define GT_NOA_ENABLE 0x00000080 + +#define GEN12_SQCNT1 XE_REG(0x8718) +#define GEN12_SQCNT1_PMON_ENABLE REG_BIT(30) +#define GEN12_SQCNT1_OABPC REG_BIT(29) + +/* Gen12 OAM unit */ +#define GEN12_OAM_HEAD_POINTER_OFFSET (0x1a0) +#define GEN12_OAM_HEAD_POINTER_MASK 0xffffffc0 + +#define GEN12_OAM_TAIL_POINTER_OFFSET (0x1a4) +#define GEN12_OAM_TAIL_POINTER_MASK 0xffffffc0 + +#define GEN12_OAM_BUFFER_OFFSET (0x1a8) +#define GEN12_OAM_BUFFER_SIZE_MASK (0x7) +#define GEN12_OAM_BUFFER_SIZE_SHIFT (3) +#define GEN12_OAM_BUFFER_MEMORY_SELECT REG_BIT(0) /* 0: PPGTT, 1: GGTT */ + +#define GEN12_OAM_CONTEXT_CONTROL_OFFSET (0x1bc) +#define GEN12_OAM_CONTEXT_CONTROL_TIMER_PERIOD_SHIFT 2 +#define GEN12_OAM_CONTEXT_CONTROL_TIMER_ENABLE REG_BIT(1) +#define GEN12_OAM_CONTEXT_CONTROL_COUNTER_RESUME REG_BIT(0) + +#define GEN12_OAM_CONTROL_OFFSET (0x194) +#define GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT 1 +#define GEN12_OAM_CONTROL_COUNTER_ENABLE REG_BIT(0) + +#define GEN12_OAM_DEBUG_OFFSET (0x198) +#define GEN12_OAM_DEBUG_BUFFER_SIZE_SELECT REG_BIT(12) +#define GEN12_OAM_DEBUG_INCLUDE_CLK_RATIO REG_BIT(6) +#define GEN12_OAM_DEBUG_DISABLE_CLK_RATIO_REPORTS REG_BIT(5) +#define GEN12_OAM_DEBUG_DISABLE_GO_1_0_REPORTS REG_BIT(2) +#define GEN12_OAM_DEBUG_DISABLE_CTX_SWITCH_REPORTS REG_BIT(1) + +#define GEN12_OAM_STATUS_OFFSET (0x19c) +#define GEN12_OAM_STATUS_COUNTER_OVERFLOW REG_BIT(2) +#define GEN12_OAM_STATUS_BUFFER_OVERFLOW REG_BIT(1) +#define GEN12_OAM_STATUS_REPORT_LOST REG_BIT(0) + +#define GEN12_OAM_MMIO_TRG_OFFSET (0x1d0) + +#define GEN12_OAM_MMIO_TRG(base) \ + XE_REG((base) + GEN12_OAM_MMIO_TRG_OFFSET) + +#define GEN12_OAM_HEAD_POINTER(base) \ + XE_REG((base) + GEN12_OAM_HEAD_POINTER_OFFSET) +#define GEN12_OAM_TAIL_POINTER(base) \ + XE_REG((base) + GEN12_OAM_TAIL_POINTER_OFFSET) +#define GEN12_OAM_BUFFER(base) \ + XE_REG((base) + GEN12_OAM_BUFFER_OFFSET) +#define GEN12_OAM_CONTEXT_CONTROL(base) \ + XE_REG((base) + GEN12_OAM_CONTEXT_CONTROL_OFFSET) +#define GEN12_OAM_CONTROL(base) \ + XE_REG((base) + GEN12_OAM_CONTROL_OFFSET) +#define GEN12_OAM_DEBUG(base) \ + XE_REG((base) + GEN12_OAM_DEBUG_OFFSET) +#define GEN12_OAM_STATUS(base) \ + XE_REG((base) + GEN12_OAM_STATUS_OFFSET) + +#define GEN12_OAM_CEC0_0_OFFSET (0x40) +#define GEN12_OAM_CEC7_1_OFFSET (0x7c) +#define GEN12_OAM_CEC0_0(base) \ + XE_REG((base) + GEN12_OAM_CEC0_0_OFFSET) +#define GEN12_OAM_CEC7_1(base) \ + XE_REG((base) + GEN12_OAM_CEC7_1_OFFSET) + +#define GEN12_OAM_STARTTRIG1_OFFSET (0x00) +#define GEN12_OAM_STARTTRIG8_OFFSET (0x1c) +#define GEN12_OAM_STARTTRIG1(base) \ + XE_REG((base) + GEN12_OAM_STARTTRIG1_OFFSET) +#define GEN12_OAM_STARTTRIG8(base) \ + XE_REG((base) + GEN12_OAM_STARTTRIG8_OFFSET) + +#define 
GEN12_OAM_REPORTTRIG1_OFFSET (0x20) +#define GEN12_OAM_REPORTTRIG8_OFFSET (0x3c) +#define GEN12_OAM_REPORTTRIG1(base) \ + XE_REG((base) + GEN12_OAM_REPORTTRIG1_OFFSET) +#define GEN12_OAM_REPORTTRIG8(base) \ + XE_REG((base) + GEN12_OAM_REPORTTRIG8_OFFSET) + +#define GEN12_OAM_PERF_COUNTER_B0_OFFSET (0x84) +#define GEN12_OAM_PERF_COUNTER_B(base, idx) \ + XE_REG((base) + GEN12_OAM_PERF_COUNTER_B0_OFFSET + 4 * (idx)) + +#endif /* __XE_OA_REGS__ */ diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 07ae208af809d..1368cc26e1604 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -25,6 +25,7 @@ #include "xe_irq.h" #include "xe_mmio.h" #include "xe_module.h" +#include "xe_oa.h" #include "xe_pcode.h" #include "xe_pm.h" #include "xe_query.h" @@ -107,6 +108,11 @@ static const struct drm_ioctl_desc xe_ioctls[] = { DRM_IOCTL_DEF_DRV(XE_WAIT_USER_FENCE, xe_wait_user_fence_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(XE_VM_MADVISE, xe_vm_madvise_ioctl, DRM_RENDER_ALLOW), + + DRM_IOCTL_DEF_DRV(XE_OA_OPEN, xe_oa_stream_open_ioctl, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(XE_OA_ADD_CONFIG, xe_oa_add_config_ioctl, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(XE_OA_REMOVE_CONFIG, xe_oa_remove_config_ioctl, DRM_RENDER_ALLOW), + }; static const struct file_operations xe_driver_fops = { @@ -317,6 +323,10 @@ int xe_device_probe(struct xe_device *xe) goto err_irq_shutdown; } + err = xe_oa_init(xe); + if (err) + goto err_irq_shutdown; + err = xe_display_init(xe); if (err) goto err_fini_display; @@ -327,6 +337,8 @@ int xe_device_probe(struct xe_device *xe) xe_display_register(xe); + xe_oa_register(xe); + xe_debugfs_register(xe); err = drmm_add_action_or_reset(&xe->drm, xe_device_sanitize, xe); @@ -355,10 +367,14 @@ static void xe_device_remove_display(struct xe_device *xe) void xe_device_remove(struct xe_device *xe) { + xe_oa_unregister(xe); + xe_device_remove_display(xe); xe_display_unlink(xe); + xe_oa_fini(xe); + xe_irq_shutdown(xe); } diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 26a8de77138a8..6bcbfbd69b877 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -16,6 +16,7 @@ #include "xe_gt_types.h" #include "xe_platform_types.h" #include "xe_step_types.h" +#include "xe_oa.h" #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY) #include "ext/intel_device_info.h" @@ -350,6 +351,9 @@ struct xe_device { /** @d3cold_allowed: Indicates if d3cold is a valid device state */ bool d3cold_allowed; + /** @oa: oa perf counter subsystem */ + struct xe_oa oa; + /* private: */ #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY) diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h index 7d4de019f9a5e..0626f993023b9 100644 --- a/drivers/gpu/drm/xe/xe_gt_types.h +++ b/drivers/gpu/drm/xe/xe_gt_types.h @@ -13,6 +13,7 @@ #include "xe_reg_sr_types.h" #include "xe_sa_types.h" #include "xe_uc_types.h" +#include "xe_oa.h" struct xe_engine_ops; struct xe_migrate; @@ -338,6 +339,8 @@ struct xe_gt { /** @oob: bitmap with active OOB workaroudns */ unsigned long *oob; } wa_active; + + struct xe_oa_gt oa; }; #endif diff --git a/drivers/gpu/drm/xe/xe_hw_engine_types.h b/drivers/gpu/drm/xe/xe_hw_engine_types.h index d788e67312b99..b2f3b5e5583ed 100644 --- a/drivers/gpu/drm/xe/xe_hw_engine_types.h +++ b/drivers/gpu/drm/xe/xe_hw_engine_types.h @@ -107,6 +107,8 @@ struct xe_hw_engine { void (*irq_handler)(struct xe_hw_engine *, u16); /** @engine_id: id for this hw engine */ enum xe_hw_engine_id engine_id; 
+ /** @oa_group: oa unit for this hw engine */ + struct xe_oa_group *oa_group; }; /** diff --git a/drivers/gpu/drm/xe/xe_module.c b/drivers/gpu/drm/xe/xe_module.c index 75e5be939f530..5b5f6c4ea9022 100644 --- a/drivers/gpu/drm/xe/xe_module.c +++ b/drivers/gpu/drm/xe/xe_module.c @@ -11,6 +11,7 @@ #include "xe_drv.h" #include "xe_hw_fence.h" #include "xe_module.h" +#include "xe_oa.h" #include "xe_pci.h" #include "xe_sched_job.h" @@ -53,6 +54,10 @@ static const struct init_funcs init_funcs[] = { .init = xe_register_pci_driver, .exit = xe_unregister_pci_driver, }, + { + .init = xe_oa_sysctl_register, + .exit = xe_oa_sysctl_unregister, + }, }; static int __init xe_init(void) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c new file mode 100644 index 0000000000000..94437a012d249 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -0,0 +1,3429 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2022 Intel Corporation + */ + +/* + * Current list of features missing in xe kmd: + * - get_default_sseu_config + * - xe_engine_set_nopreempt + */ + +#include +#include +#include +#include + +#include +#include + +#include "regs/xe_engine_regs.h" +#include "regs/xe_gpu_commands.h" +#include "regs/xe_gt_regs.h" +#include "regs/xe_lrc_layout.h" +#include "regs/xe_oa_regs.h" +#include "regs/xe_regs.h" +#include "xe_bb.h" +#include "xe_bo.h" +#include "xe_device.h" +#include "xe_engine.h" +#include "xe_gt.h" +#include "xe_gt_mcr.h" +#include "xe_lrc.h" +#include "xe_migrate.h" +#include "xe_mmio.h" +#include "xe_oa.h" +#include "xe_pm.h" +#include "xe_sched_job.h" +#include "xe_vm.h" + +#define __UNUSED__ __attribute__((unused)) + +#define OA_BUFFER_SIZE SZ_16M + +#define OA_TAKEN(tail, head) ((tail - head) & (OA_BUFFER_SIZE - 1)) + +#define OA_TAIL_MARGIN_NSEC 100000ULL +#define INVALID_TAIL_PTR 0xffffffff + +#define DEFAULT_POLL_FREQUENCY_HZ 200 +#define DEFAULT_POLL_PERIOD_NS (NSEC_PER_SEC / DEFAULT_POLL_FREQUENCY_HZ) + +static u32 xe_oa_stream_paranoid = true; + +#define OA_EXPONENT_MAX 31 +#define INVALID_CTX_ID 0xffffffff + +/* On Gen8+ automatically triggered OA reports include a 'reason' field... 
*/ +#define OAREPORT_REASON_MASK 0x3f +#define OAREPORT_REASON_MASK_EXTENDED 0x7f +#define OAREPORT_REASON_SHIFT 19 +#define OAREPORT_REASON_TIMER (1<<0) +#define OAREPORT_REASON_CTX_SWITCH (1<<3) +#define OAREPORT_REASON_CLK_RATIO (1<<5) + +#define HAS_MI_SET_PREDICATE(xe) (GRAPHICS_VERx100(xe) >= 1270) + +#define GEN11_SW_CTX_ID_SHIFT 37 +#define GEN11_SW_CTX_ID_WIDTH 11 +#define XEHP_SW_CTX_ID_SHIFT 39 +#define XEHP_SW_CTX_ID_WIDTH 16 +#define XEHP_SW_COUNTER_SHIFT 58 +#define XEHP_SW_COUNTER_WIDTH 6 +#define GEN12_GUC_SW_CTX_ID_SHIFT 39 +#define GEN12_GUC_SW_CTX_ID_WIDTH 16 +#define MAX_CONTEXT_HW_ID (1 << 21) /* exclusive */ +#define GEN11_MAX_CONTEXT_HW_ID (1 << 11) /* exclusive */ +/* in Gen12 ID 0x7FF is reserved to indicate idle */ +#define GEN12_MAX_CONTEXT_HW_ID (GEN11_MAX_CONTEXT_HW_ID - 1) +/* in Xe_HP ID 0xFFFF is reserved to indicate "invalid context" */ +#define XEHP_MAX_CONTEXT_HW_ID 0xFFFF + +#define MI_STORE_REGISTER_MEM MI_INSTR(0x24, 1) +#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 1) +#define MI_SRM_LRM_GLOBAL_GTT BIT(22) +#define MI_OPCODE(x) (((x) >> 23) & 0x3f) +#define IS_MI_LRI_CMD(x) (MI_OPCODE(x) == MI_OPCODE(MI_INSTR(0x22, 0))) +#define MI_LRI_LEN(x) (((x) & 0xff) + 1) +#define MI_STORE_DWORD_IMM_GEN4 MI_INSTR(0x20, 2) +#define MI_USE_GGTT MI_SRM_LRM_GLOBAL_GTT + +#define GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE REG_BIT(8) +#define GEN8_R_PWR_CLK_STATE(base) XE_REG((base) + 0xc8) +#define CTX_R_PWR_CLK_STATE (0x42 + 1) + +static int oa_sample_rate_hard_limit; +static u32 xe_oa_max_sample_rate = 100000; + +struct flex { + struct xe_reg reg; + u32 offset; + u32 value; +}; + +static const struct xe_oa_format oa_formats[] = { + [XE_OA_FORMAT_C4_B8] = { 7, 64 }, + [XE_OA_FORMAT_A12] = { 0, 64 }, + [XE_OA_FORMAT_A12_B8_C8] = { 2, 128 }, + [XE_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 }, + [XE_OAR_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 }, + [XE_OA_FORMAT_A24u40_A14u32_B8_C8] = { 5, 256 }, + [XE_OAM_FORMAT_MPEC8u64_B8_C8] = { 1, 192, TYPE_OAM, HDR_64_BIT }, + [XE_OAM_FORMAT_MPEC8u32_B8_C8] = { 2, 128, TYPE_OAM, HDR_64_BIT }, +}; + +static const u32 mtl_oa_base[] = { + [OA_GROUP_OAM_SAMEDIA_0] = 0x393000, +}; + +#define SAMPLE_OA_REPORT BIT(0) + +struct perf_open_properties { + u32 sample_flags; + + u64 single_context:1; // FIXME: rename to single_engine? + u64 hold_preemption:1; + u64 ctx_handle; // FIXME: rename to engine_id? + + /* OA sampling state */ + int metrics_set; + int oa_format; + bool oa_periodic; + int oa_period_exponent; + + // struct intel_sseu sseu; // FIXME: support in xe kmd? 
+ + struct xe_hw_engine *hwe; + + u64 poll_oa_period; +}; + +struct xe_oa_config_bo { + struct llist_node node; + + struct xe_oa_config *oa_config; + struct xe_bb *bb; // FIXME: check +}; + +static struct ctl_table_header *sysctl_header; + +void xe_oa_config_release(struct kref *ref) +{ + struct xe_oa_config *oa_config = + container_of(ref, typeof(*oa_config), ref); + + kfree(oa_config->flex_regs); + kfree(oa_config->b_counter_regs); + kfree(oa_config->mux_regs); + + kfree_rcu(oa_config, rcu); +} + +void xe_oa_config_put(struct xe_oa_config *oa_config) +{ + if (!oa_config) + return; + + kref_put(&oa_config->ref, xe_oa_config_release); +} + +struct xe_oa_config *xe_oa_config_get(struct xe_oa_config *oa_config) +{ + if (kref_get_unless_zero(&oa_config->ref)) + return oa_config; + else + return NULL; +} + +struct xe_oa_config *xe_oa_get_oa_config(struct xe_oa *oa, int metrics_set) +{ + struct xe_oa_config *oa_config; + + rcu_read_lock(); + oa_config = idr_find(&oa->metrics_idr, metrics_set); + if (oa_config) + oa_config = xe_oa_config_get(oa_config); + rcu_read_unlock(); + + return oa_config; +} + +static void free_oa_config_bo(struct xe_oa_config_bo *oa_bo) +{ + xe_oa_config_put(oa_bo->oa_config); + xe_bb_free(oa_bo->bb, NULL); + kfree(oa_bo); +} + +static const struct xe_oa_regs *__oa_regs(struct xe_oa_stream *stream) +{ + return &stream->hwe->oa_group->regs; +} + +static u32 gen12_oa_hw_tail_read(struct xe_oa_stream *stream) +{ + return xe_mmio_read32(stream->gt, __oa_regs(stream)->oa_tail_ptr) & + GEN12_OAG_OATAILPTR_MASK; +} + +#define oa_report_header_64bit(__s) \ + ((__s)->oa_buffer.format->header == HDR_64_BIT) + +static u64 oa_report_id(struct xe_oa_stream *stream, void *report) +{ + return oa_report_header_64bit(stream) ? *(u64 *)report : *(u32 *)report; +} + +static u64 oa_report_reason(struct xe_oa_stream *stream, void *report) +{ + return (oa_report_id(stream, report) >> OAREPORT_REASON_SHIFT) & + (GRAPHICS_VER(stream->oa->xe) >= 12 ? + OAREPORT_REASON_MASK_EXTENDED : + OAREPORT_REASON_MASK); +} + +static void oa_report_id_clear(struct xe_oa_stream *stream, u32 *report) +{ + if (oa_report_header_64bit(stream)) + *(u64 *)report = 0; + else + *report = 0; +} + +static bool oa_report_ctx_invalid(struct xe_oa_stream *stream, void *report) +{ + return false; +} + +static u64 oa_timestamp(struct xe_oa_stream *stream, void *report) +{ + return oa_report_header_64bit(stream) ? + *((u64 *)report + 1) : + *((u32 *)report + 1); +} + +static void oa_timestamp_clear(struct xe_oa_stream *stream, u32 *report) +{ + if (oa_report_header_64bit(stream)) + *(u64 *)&report[2] = 0; + else + report[1] = 0; +} + +static u32 oa_context_id(struct xe_oa_stream *stream, u32 *report) +{ + u32 ctx_id = oa_report_header_64bit(stream) ? report[4] : report[2]; + + return ctx_id & stream->specific_ctx_id_mask; +} + +static void oa_context_id_squash(struct xe_oa_stream *stream, u32 *report) +{ + if (oa_report_header_64bit(stream)) + report[4] = INVALID_CTX_ID; + else + report[2] = INVALID_CTX_ID; +} + +static bool oa_buffer_check_unlocked(struct xe_oa_stream *stream) +{ + u32 gtt_offset = xe_bo_ggtt_addr(stream->oa_buffer.bo); + int report_size = stream->oa_buffer.format->size; + u32 head, tail, read_tail; + unsigned long flags; + bool pollin; + u32 hw_tail; + u64 now; + u32 partial_report_size; + + /* We have to consider the (unlikely) possibility that read() errors + * could result in an OA buffer reset which might reset the head and + * tail state. 
*/ + spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); + + hw_tail = gen12_oa_hw_tail_read(stream); + + /* The tail pointer increases in 64 byte increments, not in report_size + * steps. Also the report size may not be a power of 2. Compute the + * potentially partially landed report in the OA buffer + */ + partial_report_size = OA_TAKEN(hw_tail, stream->oa_buffer.tail); + partial_report_size %= report_size; + + /* Subtract partial amount off the tail */ + hw_tail = OA_TAKEN(hw_tail, partial_report_size); + + now = ktime_get_mono_fast_ns(); + + /* NB: The head we observe here might effectively be a little + * out of date. If a read() is in progress, the head could be + * anywhere between this head and stream->oa_buffer.tail. + */ + head = stream->oa_buffer.head - gtt_offset; + read_tail = stream->oa_buffer.tail - gtt_offset; + + tail = hw_tail; + + /* Walk the stream backward until we find a report with report + * id and timestamp not at 0. Since the circular buffer pointers + * progress by increments of 64 bytes and reports can be up + * to 256 bytes long, we can't tell whether a report has fully + * landed in memory before the report id and timestamp of the + * following report have effectively landed. + * + * This is assuming that the writes of the OA unit land in + * memory in the order they were written to. + * If not : (╯°□°)╯︵ ┻━┻ + */ + while (OA_TAKEN(tail, read_tail) >= report_size) { + void *report = stream->oa_buffer.vaddr + tail; + + if (oa_report_id(stream, report) || + oa_timestamp(stream, report)) + break; + + tail = (tail - report_size) & (OA_BUFFER_SIZE - 1); + } +#if 0 // FIXME + if (OA_TAKEN(hw_tail, tail) > report_size && + __ratelimit(&stream->oa->tail_pointer_race)) + drm_dbg(&stream->oa->xe->drm, + "unlanded report(s) head=0x%x tail=0x%x hw_tail=0x%x\n", + head, tail, hw_tail); +#endif + stream->oa_buffer.tail = gtt_offset + tail; + + pollin = OA_TAKEN(stream->oa_buffer.tail, + stream->oa_buffer.head) >= report_size; + + spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); + + return pollin; +} + +static int append_oa_status(struct xe_oa_stream *stream, + char __user *buf, + size_t count, + size_t *offset, + enum drm_xe_oa_record_type type) +{ + struct drm_xe_oa_record_header header = { type, 0, sizeof(header) }; + + if ((count - *offset) < header.size) + return -ENOSPC; + + if (copy_to_user(buf + *offset, &header, sizeof(header))) + return -EFAULT; + + (*offset) += header.size; + + return 0; +} + +static int append_oa_sample(struct xe_oa_stream *stream, + char __user *buf, + size_t count, + size_t *offset, + const u8 *report) +{ + int report_size = stream->oa_buffer.format->size; + struct drm_xe_oa_record_header header; + int report_size_partial; + u8 *oa_buf_end; + + header.type = DRM_XE_OA_RECORD_SAMPLE; + header.pad = 0; + header.size = stream->sample_size; + + if ((count - *offset) < header.size) + return -ENOSPC; + + buf += *offset; + if (copy_to_user(buf, &header, sizeof(header))) + return -EFAULT; + buf += sizeof(header); + + oa_buf_end = stream->oa_buffer.vaddr + OA_BUFFER_SIZE; + report_size_partial = oa_buf_end - report; + + if (report_size_partial < report_size) { + if (copy_to_user(buf, report, report_size_partial)) + return -EFAULT; + buf += report_size_partial; + + if (copy_to_user(buf, stream->oa_buffer.vaddr, + report_size - report_size_partial)) + return -EFAULT; + } else if (copy_to_user(buf, report, report_size)) { + return -EFAULT; + } + + (*offset) += header.size; + + return 0; +} + +static int gen8_append_oa_reports(struct 
xe_oa_stream *stream, + char __user *buf, + size_t count, + size_t *offset) +{ + int report_size = stream->oa_buffer.format->size; + u8 *oa_buf_base = stream->oa_buffer.vaddr; + u32 gtt_offset = xe_bo_ggtt_addr(stream->oa_buffer.bo); + u32 mask = (OA_BUFFER_SIZE - 1); + size_t start_offset = *offset; + unsigned long flags; + u32 head, tail; + int ret = 0; + + if (drm_WARN_ON(&stream->gt->tile->xe->drm, !stream->enabled)) + return -EIO; + + spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); + + head = stream->oa_buffer.head; + tail = stream->oa_buffer.tail; + + spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); + + /* + * NB: oa_buffer.head/tail include the gtt_offset which we don't want + * while indexing relative to oa_buf_base. + */ + head -= gtt_offset; + tail -= gtt_offset; + + /* + * An out of bounds or misaligned head or tail pointer implies a driver + * bug since we validate + align the tail pointers we read from the + * hardware and we are in full control of the head pointer which should + * only be incremented by multiples of the report size. + */ + if (drm_WARN_ONCE(&stream->gt->tile->xe->drm, + head > OA_BUFFER_SIZE || + tail > OA_BUFFER_SIZE, + "Inconsistent OA buffer pointers: head = %u, tail = %u\n", + head, tail)) + return -EIO; + + + for (/* none */; + OA_TAKEN(tail, head); + head = (head + report_size) & mask) { + u8 *report = oa_buf_base + head; + u32 *report32 = (void *)report; + u32 ctx_id; + u64 reason; + + /* + * The reason field includes flags identifying what + * triggered this specific report (mostly timer + * triggered or e.g. due to a context switch). + * + * In MMIO triggered reports, some platforms do not set the + * reason bit in this field and it is valid to have a reason + * field of zero. + */ + reason = oa_report_reason(stream, report); + ctx_id = oa_context_id(stream, report32); + + /* + * Squash whatever is in the CTX_ID field if it's marked as + * invalid to be sure we avoid false-positive, single-context + * filtering below... + * + * Note: that we don't clear the valid_ctx_bit so userspace can + * understand that the ID has been squashed by the kernel. + */ + if (oa_report_ctx_invalid(stream, report)) { + ctx_id = INVALID_CTX_ID; + oa_context_id_squash(stream, report32); + } + + /* + * NB: For Gen 8 the OA unit no longer supports clock gating + * off for a specific context and the kernel can't securely + * stop the counters from updating as system-wide / global + * values. + * + * Automatic reports now include a context ID so reports can be + * filtered on the cpu but it's not worth trying to + * automatically subtract/hide counter progress for other + * contexts while filtering since we can't stop userspace + * issuing MI_REPORT_PERF_COUNT commands which would still + * provide a side-band view of the real values. + * + * To allow userspace (such as Mesa/GL_INTEL_performance_query) + * to normalize counters for a single filtered context then it + * needs be forwarded bookend context-switch reports so that it + * can track switches in between MI_REPORT_PERF_COUNT commands + * and can itself subtract/ignore the progress of counters + * associated with other contexts. Note that the hardware + * automatically triggers reports when switching to a new + * context which are tagged with the ID of the newly active + * context. To avoid the complexity (and likely fragility) of + * reading ahead while parsing reports to try and minimize + * forwarding redundant context switch reports (i.e. 
between + * other, unrelated contexts) we simply elect to forward them + * all. + * + * We don't rely solely on the reason field to identify context + * switches since it's not-uncommon for periodic samples to + * identify a switch before any 'context switch' report. + */ + if (!stream->engine || // FIXME: check + stream->specific_ctx_id == ctx_id || + stream->oa_buffer.last_ctx_id == stream->specific_ctx_id || + reason & OAREPORT_REASON_CTX_SWITCH) { + + /* + * While filtering for a single context we avoid + * leaking the IDs of other contexts. + */ + if (stream->engine && // FIXME: check + stream->specific_ctx_id != ctx_id) { + oa_context_id_squash(stream, report32); + } + + ret = append_oa_sample(stream, buf, count, offset, + report); + if (ret) + break; + + stream->oa_buffer.last_ctx_id = ctx_id; + } + + if (is_power_of_2(report_size)) { + /* + * Clear out the report id and timestamp as a means + * to detect unlanded reports. + */ + oa_report_id_clear(stream, report32); + oa_timestamp_clear(stream, report32); + } else { + /* Zero out the entire report */ + memset(report32, 0, report_size); + } + } + + if (start_offset != *offset) { + struct xe_reg oaheadptr = __oa_regs(stream)->oa_head_ptr; + + spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); + + /* + * We removed the gtt_offset for the copy loop above, indexing + * relative to oa_buf_base so put back here... + */ + head += gtt_offset; + xe_mmio_write32(stream->gt, oaheadptr, + head & GEN12_OAG_OAHEADPTR_MASK); + stream->oa_buffer.head = head; + + spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); + } + + return ret; +} + +static void gen12_init_oa_buffer(struct xe_oa_stream *stream) +{ + u32 gtt_offset = xe_bo_ggtt_addr(stream->oa_buffer.bo); + unsigned long flags; + + spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); + + xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_status, 0); + xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_head_ptr, + gtt_offset & GEN12_OAG_OAHEADPTR_MASK); + stream->oa_buffer.head = gtt_offset; + + /* + * PRM says: + * + * "This MMIO must be set before the OATAILPTR + * register and after the OAHEADPTR register. This is + * to enable proper functionality of the overflow + * bit." + */ + xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_buffer, gtt_offset | + OABUFFER_SIZE_16M | GEN12_OAG_OABUFFER_MEMORY_SELECT); + xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_tail_ptr, + gtt_offset & GEN12_OAG_OATAILPTR_MASK); + + /* Mark that we need updated tail pointers to read from... */ + stream->oa_buffer.tail = gtt_offset; + + /* + * Reset state used to recognise context switches, affecting which + * reports we will forward to userspace while filtering for a single + * context. + */ + stream->oa_buffer.last_ctx_id = INVALID_CTX_ID; + + spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); + + /* + * NB: although the OA buffer will initially be allocated + * zeroed via shmfs (and so this memset is redundant when + * first allocating), we may re-init the OA buffer, either + * when re-enabling a stream or in error/reset paths. + * + * The reason we clear the buffer for each re-init is for the + * sanity check in gen8_append_oa_reports() that looks at the + * reason field to make sure it's non-zero which relies on + * the assumption that new reports are being written to zeroed + * memory... 
+ */ + memset(stream->oa_buffer.vaddr, 0, stream->oa_buffer.bo->size); +} + +static void gen12_oa_enable(struct xe_oa_stream *stream) +{ + const struct xe_oa_regs *regs; + u32 val; + + /* + * If we don't want OA reports from the OA buffer, then we don't + * even need to program the OAG unit. + */ + if (!(stream->sample_flags & SAMPLE_OA_REPORT)) + return; + + gen12_init_oa_buffer(stream); + + regs = __oa_regs(stream); + val = (stream->oa_buffer.format->format << regs->oa_ctrl_counter_format_shift) | + GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE; + + xe_mmio_write32(stream->gt, regs->oa_ctrl, val); +} + +static void gen12_oa_disable(struct xe_oa_stream *stream) +{ + xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_ctrl, 0); + if (xe_mmio_wait32(stream->gt, __oa_regs(stream)->oa_ctrl, 0, + GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE, 50000, NULL, false)) + drm_err(&stream->oa->xe->drm, + "wait for OA to be disabled timed out\n"); + + xe_mmio_write32(stream->gt, GEN12_OA_TLB_INV_CR, 1); + if (xe_mmio_wait32(stream->gt, GEN12_OA_TLB_INV_CR, 0, 1, 50000, NULL, false)) + drm_err(&stream->oa->xe->drm, + "wait for OA tlb invalidate timed out\n"); +} + +static int gen8_oa_read(struct xe_oa_stream *stream, + char __user *buf, + size_t count, + size_t *offset) +{ + struct xe_reg oastatus_reg = __oa_regs(stream)->oa_status; + u32 oastatus; + int ret; + + if (drm_WARN_ON(&stream->gt->tile->xe->drm, !stream->oa_buffer.vaddr)) + return -EIO; + + oastatus = xe_mmio_read32(stream->gt, oastatus_reg); + + /* + * We treat OABUFFER_OVERFLOW as a significant error: + * + * Although theoretically we could handle this more gracefully + * sometimes, some Gens don't correctly suppress certain + * automatically triggered reports in this condition and so we + * have to assume that old reports are now being trampled + * over. + * + * Considering how we don't currently give userspace control + * over the OA buffer size and always configure a large 16MB + * buffer, then a buffer overflow does anyway likely indicate + * that something has gone quite badly wrong. 
+ */ + if (oastatus & GEN12_OAG_OASTATUS_BUFFER_OVERFLOW) { + ret = append_oa_status(stream, buf, count, offset, + DRM_XE_OA_RECORD_OA_BUFFER_LOST); + if (ret) + return ret; + + drm_dbg(&stream->oa->xe->drm, + "OA buffer overflow (exponent = %d): force restart\n", + stream->period_exponent); + + gen12_oa_disable(stream); + gen12_oa_enable(stream); + + /* + * Note: .oa_enable() is expected to re-init the oabuffer and + * reset GEN8_OASTATUS for us + */ + oastatus = xe_mmio_read32(stream->gt, oastatus_reg); + } + + if (oastatus & GEN12_OAG_OASTATUS_REPORT_LOST) { + ret = append_oa_status(stream, buf, count, offset, + DRM_XE_OA_RECORD_OA_REPORT_LOST); + if (ret) + return ret; + + xe_mmio_rmw32(stream->gt, oastatus_reg, + GEN12_OAG_OASTATUS_COUNTER_OVERFLOW | + GEN12_OAG_OASTATUS_REPORT_LOST, 0); + } + + return gen8_append_oa_reports(stream, buf, count, offset); +} + +static int xe_oa_wait_unlocked(struct xe_oa_stream *stream) +{ + /* We would wait indefinitely if periodic sampling is not enabled */ + if (!stream->periodic) + return -EIO; + + return wait_event_interruptible(stream->poll_wq, + oa_buffer_check_unlocked(stream)); +} + +static void xe_oa_poll_wait(struct xe_oa_stream *stream, + struct file *file, + poll_table *wait) +{ + poll_wait(file, &stream->poll_wq, wait); +} + +static int __xe_oa_read(struct xe_oa_stream *stream, + char __user *buf, + size_t count, + size_t *offset) +{ + return gen8_oa_read(stream, buf, count, offset); +} + +#if 0 +// If this is needed need to look into further +static int oa_submit_bb(struct xe_oa_stream *stream, struct xe_bb *bb) +{ + struct xe_hw_engine *hwe = stream->hwe; + struct xe_engine *e; + struct xe_sched_job *job; + struct dma_fence *fence; + struct xe_vm *vm; + u64 batch_ofs; + long timeout; + int err = 0; + + if (stream->engine) { + /* + * FIXME: can we send kernel bb in e->vm context? Seems to be + * causing big problems (cat err) which need to be investigated + */ + e = stream->engine; + XE_BUG_ON(!e->vm); + err = dma_resv_lock_interruptible(&e->vm->resv, NULL); + if (err) + goto exit; + down_write(&e->vm->lock); + job = xe_bb_create_job(e, bb); + if (IS_ERR(job)) { + err = PTR_ERR(job); + goto vm_unlock; + } + } else { + vm = xe_migrate_get_vm(stream->gt->tile->migrate); + e = xe_engine_create(hwe->gt->tile->xe, vm, BIT(hwe->logical_instance), 1, + hwe, ENGINE_FLAG_WA); + if (IS_ERR(e)) { + err = PTR_ERR(e); + drm_err(&stream->oa->xe->drm, "gt%d, hwe %s, xe_engine_create,e failed=%d", + stream->gt->info.id, hwe->name, err); + goto put_vm; + } + + batch_ofs = xe_bo_ggtt_addr(stream->gt->tile->mem.kernel_bb_pool->bo); + /* Will add MI_BATCH_BUFFER_END */ + job = xe_bb_create_wa_job(e, bb, batch_ofs); + if (IS_ERR(job)) { + err = PTR_ERR(job); + goto put_engine; + } + } + + xe_sched_job_arm(job); + fence = dma_fence_get(&job->drm.s_fence->finished); + xe_sched_job_push(job); + + timeout = dma_fence_wait_timeout(fence, false, HZ); + dma_fence_put(fence); + if (timeout < 0) + err = timeout; + else if (!timeout) + err = -ETIME; +put_engine: + if (!stream->engine) + xe_engine_put(e); +put_vm: + if (!stream->engine) + xe_vm_put(vm); +vm_unlock: + if (stream->engine) { + dma_resv_unlock(&e->vm->resv); + up_write(&e->vm->lock); + } +exit: + return err; +} +#endif + +/* + FIXME: Currently submits only to stream->engine or new engine for + stream->hwe. If needed, add 'struct xe_engine *' argument + + For now unconditionally create engine otherwise we hit BUG_ON in + xe_bb_create_wa_job. 
If jobs need to be sent to the same engine for + serialization may need to replace xe_bb_create_wa_job with a similar + function. + + Also the code is wrong for gen12_guc_sw_ctx_id because there we need to + submit against the real engine/context rather than the new engine created + below. +*/ +static int oa_submit_bb(struct xe_oa_stream *stream, struct xe_bb *bb) +{ + struct xe_hw_engine *hwe = stream->hwe; + struct xe_engine *e = stream->engine; + struct xe_sched_job *job; + struct dma_fence *fence; + struct xe_vm *vm; + u64 batch_ofs; + long timeout; + int err = 0; + + vm = xe_migrate_get_vm(stream->gt->tile->migrate); + // if (!stream->engine) { + e = xe_engine_create(hwe->gt->tile->xe, vm, BIT(hwe->logical_instance), 1, + hwe, ENGINE_FLAG_WA); + if (IS_ERR(e)) { + err = PTR_ERR(e); + drm_err(&stream->oa->xe->drm, "gt%d, hwe %s, xe_engine_create,e failed=%d", + stream->gt->info.id, hwe->name, err); + goto put_vm; + } + // } + + batch_ofs = xe_bo_ggtt_addr(stream->gt->tile->mem.kernel_bb_pool->bo); + /* Will add MI_BATCH_BUFFER_END */ + job = xe_bb_create_wa_job(e, bb, batch_ofs); + if (IS_ERR(job)) { + err = PTR_ERR(job); + goto put_engine; + } + + xe_sched_job_arm(job); + fence = dma_fence_get(&job->drm.s_fence->finished); + xe_sched_job_push(job); + + timeout = dma_fence_wait_timeout(fence, false, HZ); + dma_fence_put(fence); + if (timeout < 0) + err = timeout; + else if (!timeout) + err = -ETIME; + +put_engine: + // if (!stream->engine) + xe_engine_put(e); +put_vm: + xe_vm_put(vm); + + return err; +} + +static void oa_pin_context(struct xe_oa_stream *stream) +{ + // contexts are already pinned for now +} + +static void __store_reg_to_mem(struct xe_bb *bb, struct xe_reg reg, u32 ggtt_offset) +{ + u32 cmd; + + cmd = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; + cmd++; + + bb->cs[bb->len++] = cmd; + bb->cs[bb->len++] = reg.addr; + bb->cs[bb->len++] = ggtt_offset; + bb->cs[bb->len++] = 0; +} + +static int __read_reg(struct xe_oa_stream *stream, struct xe_reg reg, u32 ggtt_offset) +{ + struct xe_bb *bb; + int err = 0; + + bb = xe_bb_new(stream->gt, 4 + 1, false); + if (IS_ERR(bb)) { + err = PTR_ERR(bb); + goto exit; + } + + __store_reg_to_mem(bb, reg, ggtt_offset); + + err = oa_submit_bb(stream, bb); + xe_bb_free(bb, NULL); +exit: + return err; +} + +static int gen12_guc_sw_ctx_id(struct xe_oa_stream *stream, u32 *ctx_id) +{ + struct xe_bo *bo; + u32 *ptr; + int err = 0; + + bo = xe_bo_create_pin_map(stream->gt->tile->xe, stream->gt->tile, NULL, + 4096, ttm_bo_type_kernel, + // XE_BO_CREATE_VRAM_IF_DGFX(gt) | + XE_BO_CREATE_SYSTEM_BIT | // FIXME: check + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(bo)) { + err = PTR_ERR(bo); + goto exit; + } + + err = __read_reg(stream, RING_EXECLIST_STATUS_HI(stream->hwe->mmio_base), + xe_bo_ggtt_addr(bo)); + if (err) + goto unpin; + + ptr = bo->vmap.is_iomem ? 
bo->vmap.vaddr_iomem : bo->vmap.vaddr; + + *ctx_id = *ptr; +unpin: + xe_bo_unpin_map_no_vm(bo); +exit: + return err; +} + +static int gen12_get_render_context_id(struct xe_oa_stream *stream) +{ + u32 ctx_id, mask; + int ret; + + if (xe_device_guc_submission_enabled(stream->gt->tile->xe)) { + ret = gen12_guc_sw_ctx_id(stream, &ctx_id); + if (ret) + return ret; + + mask = ((1U << GEN12_GUC_SW_CTX_ID_WIDTH) - 1) << + (GEN12_GUC_SW_CTX_ID_SHIFT - 32); + } else if (GRAPHICS_VERx100(stream->gt->tile->xe) >= 1250) { + ctx_id = (XEHP_MAX_CONTEXT_HW_ID - 1) << + (XEHP_SW_CTX_ID_SHIFT - 32); + + mask = ((1U << XEHP_SW_CTX_ID_WIDTH) - 1) << + (XEHP_SW_CTX_ID_SHIFT - 32); + } else { + ctx_id = (GEN12_MAX_CONTEXT_HW_ID - 1) << + (GEN11_SW_CTX_ID_SHIFT - 32); + + mask = ((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << + (GEN11_SW_CTX_ID_SHIFT - 32); + } + stream->specific_ctx_id = ctx_id & mask; + stream->specific_ctx_id_mask = mask; + + return 0; +} + +#define MI_LRI_LEN(x) (((x) & 0xff) + 1) + +static bool oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end) +{ + u32 idx = *offset; + u32 len = min(MI_LRI_LEN(state[idx]) + idx, end); + bool found = false; + + idx++; + for (; idx < len; idx += 2) { + if (state[idx] == reg) { + found = true; + break; + } + } + + *offset = idx; + return found; +} + +static u32 oa_context_image_offset(struct xe_oa_stream *stream, u32 reg) +{ + // FIXME: check len and state assignments below + u32 len = (xe_lrc_size(stream->gt->tile->xe, stream->hwe->class) - PAGE_SIZE) / 4; + u32 *state = stream->gt->default_lrc[stream->hwe->class]; + u32 offset; + + /* + * FIXME: maybe ok but really __xe_lrc_regs_offset should be added to + * state. The same offset should be used in gen12_configure_oar_context + * where ctx_oactxctrl_offset is consumed. Also instead of default_lrc + * we could use stream->engine->lrc or stream->hwe->kernel_lrc + */ + if (drm_WARN_ON(&stream->oa->xe->drm, !state)) + return U32_MAX; + + for (offset = 0; offset < len; ) { + if (IS_MI_LRI_CMD(state[offset])) { + /* + * We expect reg-value pairs in MI_LRI command, so + * MI_LRI_LEN() should be even, if not, issue a warning. + */ + drm_WARN_ON(&stream->oa->xe->drm, + MI_LRI_LEN(state[offset]) & 0x1); + + if (oa_find_reg_in_lri(state, reg, &offset, len)) + break; + } else { + offset++; + } + } + + return offset < len ? offset : U32_MAX; +} + +static int set_oa_ctx_ctrl_offset(struct xe_oa_stream *stream) +{ + struct xe_reg reg = GEN12_OACTXCONTROL(stream->hwe->mmio_base); + u32 offset = stream->oa->ctx_oactxctrl_offset; + + /* Do this only once. Failure is stored as offset of U32_MAX */ + if (offset) + goto exit; + + offset = oa_context_image_offset(stream, reg.addr); + stream->oa->ctx_oactxctrl_offset = offset; + + drm_dbg(&stream->oa->xe->drm, + "%s oa ctx control at 0x%08x dword offset\n", + stream->hwe->name, offset); +exit: + return offset && offset != U32_MAX ? 0 : -ENODEV; +} + +static bool engine_supports_mi_query(struct xe_hw_engine *hwe) +{ + return hwe->class == XE_ENGINE_CLASS_RENDER; +} + +static int oa_get_render_ctx_id(struct xe_oa_stream *stream) +{ + int ret = 0; + + oa_pin_context(stream); + + if (engine_supports_mi_query(stream->hwe)) { + /* + * We are enabling perf query here. If we don't find the context + * offset here, just return an error. 
+ */ + ret = set_oa_ctx_ctrl_offset(stream); + if (ret) { + drm_err(&stream->gt->tile->xe->drm, + "set_oa_ctx_ctrl_offset failed for %s\n", + stream->hwe->name); + return ret; + } + } + + ret = gen12_get_render_context_id(stream); + + drm_dbg(&stream->gt->tile->xe->drm, + "filtering on ctx_id=0x%x ctx_id_mask=0x%x\n", + stream->specific_ctx_id, + stream->specific_ctx_id_mask); + + return ret; +} + +static void oa_put_render_ctx_id(struct xe_oa_stream *stream) +{ + stream->specific_ctx_id = INVALID_CTX_ID; + stream->specific_ctx_id_mask = 0; +} + +static void free_oa_buffer(struct xe_oa_stream *stream) +{ + xe_bo_unpin_map_no_vm(stream->oa_buffer.bo); +} + +static void free_oa_configs(struct xe_oa_stream *stream) +{ + struct xe_oa_config_bo *oa_bo, *tmp; + + // FIXME: check functions below + xe_oa_config_put(stream->oa_config); + llist_for_each_entry_safe(oa_bo, tmp, stream->oa_config_bos.first, node) + free_oa_config_bo(oa_bo); +} + +static void free_noa_wait(struct xe_oa_stream *stream) +{ + xe_bo_unpin_map_no_vm(stream->noa_wait); +} + +static bool engine_supports_oa(const struct xe_hw_engine *hwe) +{ + return hwe->oa_group; +} + +static bool engine_supports_oa_format(const struct xe_hw_engine *hwe, int type) +{ + return hwe->oa_group && hwe->oa_group->type == type; +} + +static void gen8_store_flex(struct xe_oa_stream *stream, struct xe_lrc *lrc, + struct xe_bb *bb, const struct flex *flex, u32 count) +{ + u32 offset = xe_bo_ggtt_addr(lrc->bo); + + do { + bb->cs[bb->len++] = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + bb->cs[bb->len++] = offset + flex->offset * sizeof(u32); + bb->cs[bb->len++] = 0; + bb->cs[bb->len++] = flex->value; + + } while (flex++, --count); +} + +static int gen8_modify_context(struct xe_oa_stream *stream, struct xe_lrc *lrc, + const struct flex *flex, u32 count) +{ + struct xe_bb *bb; + int err = 0; + + bb = xe_bb_new(stream->gt, 4 * count + 1, false); + if (IS_ERR(bb)) { + err = PTR_ERR(bb); + goto exit; + } + + gen8_store_flex(stream, lrc, bb, flex, count); + + err = oa_submit_bb(stream, bb); + xe_bb_free(bb, NULL); +exit: + return err; +} + +static void gen8_load_flex(struct xe_oa_stream *stream, struct xe_bb *bb, + const struct flex *flex, u32 count) +{ + XE_BUG_ON(!count || count > 63); + + bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM(count); + + do { + bb->cs[bb->len++] = flex->reg.addr; + bb->cs[bb->len++] = flex->value; + + } while (flex++, --count); + + bb->cs[bb->len++] = MI_NOOP; +} + +static int gen8_modify_self(struct xe_oa_stream *stream, + const struct flex *flex, u32 count) +{ + struct xe_bb *bb; + int err = 0; + + bb = xe_bb_new(stream->gt, 2 * count + 3, false); + if (IS_ERR(bb)) { + err = PTR_ERR(bb); + goto exit; + } + + gen8_load_flex(stream, bb, flex, count); + + err = oa_submit_bb(stream, bb); + xe_bb_free(bb, NULL); +exit: + return err; +} + +static int gen8_configure_context(struct xe_oa_stream *stream, + struct xe_engine *engine, + struct flex *flex, u32 count) +{ + int i, err = 0; + + for (i = 0; i < engine->width; i++) { + // flex->value = intel_sseu_make_rpcs(ce->engine->gt, &ce->sseu); // FIXME + err = gen8_modify_context(stream, &engine->lrc[i], flex, count); + if (err) + break; + } + + return err; +} + +static int oa_configure_all_contexts(struct xe_oa_stream *stream, + struct flex *regs, + size_t num_regs, bool enable) +{ + struct xe_file *xef = stream->xef; + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + struct xe_engine *e; + unsigned long idx; + int err; + + // FIXME: below crashes during close, need to check xef mutex + 
return 0; + + // FIXME: check + mutex_lock(&xef->engine.lock); + xa_for_each(&xef->engine.xa, idx, e) { + xe_engine_get(e); + err = gen8_configure_context(stream, e, regs, num_regs); + xe_engine_put(e); + if (err) + return err; + } + mutex_unlock(&xef->engine.lock); + + /* + * After updating all other contexts, we need to modify ourselves. If + * we don't modify the kernel_context, we do not get events while idle. + */ + for_each_hw_engine(hwe, stream->gt, id) { + /* + * FIXME: at present there is no way to create an engine using + * hwe->kernel_lrc. Also in xe we don't use kernel_lrc when idle, + * though we would need a 'context' restored to get events when idle + * to make sure registers are programmed correctly. + */ + } + + return 0; +} + +static __UNUSED__ int +lrc_configure_all_contexts(struct xe_oa_stream *stream, + const struct xe_oa_config *oa_config) +{ + return 0; // FIXME: not used for gen12+ +} + +static int gen12_configure_all_contexts(struct xe_oa_stream *stream, bool enable) +{ + struct flex regs[] = { + { + GEN8_R_PWR_CLK_STATE(RENDER_RING_BASE), + CTX_R_PWR_CLK_STATE, + }, + }; + + if (stream->hwe->class != XE_ENGINE_CLASS_RENDER) + return 0; + + // FIXME: what should this do when enable == false? + + return oa_configure_all_contexts(stream, regs, ARRAY_SIZE(regs), enable); +} + +static int gen12_configure_oar_context(struct xe_oa_stream *stream, bool enable) +{ + int err; + u32 format = stream->oa_buffer.format->format; + u32 offset = stream->oa->ctx_oactxctrl_offset; + struct flex regs_context[] = { + { + GEN8_OACTXCONTROL, + offset + 1, + enable ? GEN8_OA_COUNTER_RESUME : 0, + }, + }; + /* Offsets in regs_lri are not used since this configuration is only + * applied using LRI. Initialize the correct offsets for posterity. + */ +#define GEN12_OAR_OACONTROL_OFFSET 0x5B0 + struct flex regs_lri[] = { + { + GEN12_OAR_OACONTROL, + GEN12_OAR_OACONTROL_OFFSET + 1, + (format << GEN12_OAR_OACONTROL_COUNTER_FORMAT_SHIFT) | + (enable ? GEN12_OAR_OACONTROL_COUNTER_ENABLE : 0) + }, + { + RING_CONTEXT_CONTROL(stream->hwe->mmio_base), + CTX_CONTEXT_CONTROL, + _MASKED_FIELD(GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE, + enable ? + GEN12_CTX_CTRL_OAR_CONTEXT_ENABLE : + 0) + }, + }; + + /* + * Modify the context image of pinned context with regs_context + * FIXME: for now only modifying engine->lrc[0], but maybe this should + * be changed to modify all lrc's underlying the engine? + */ + err = gen8_modify_context(stream, &stream->engine->lrc[0], + regs_context, ARRAY_SIZE(regs_context)); + if (err) + return err; + + /* Apply regs_lri using LRI with pinned context */ + return gen8_modify_self(stream, regs_lri, ARRAY_SIZE(regs_lri)); +} + +bool HAS_OA_BPC_REPORTING(struct xe_device *xe) // FIXME +{ + switch (xe->info.platform) { + case XE_DG2: + case XE_PVC: + case XE_METEORLAKE: + return true; + default: + return false; + } +} + +static void gen12_disable_metric_set(struct xe_oa_stream *stream) +{ + u32 sqcnt1; + + /* + * Wa_1508761755:xehpsdv, dg2 + * Enable thread stall DOP gating and EU DOP gating. + */ + if (stream->gt->tile->xe->info.platform == XE_DG2) { + xe_gt_mcr_multicast_write(stream->gt, GEN8_ROW_CHICKEN, + _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE)); + xe_mmio_write32(stream->gt, GEN7_ROW_CHICKEN2, + _MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING)); + } + + /* Reset all contexts' slices/subslices configurations. 
*/ + gen12_configure_all_contexts(stream, false); + + /* disable the context save/restore or OAR counters */ + if (stream->engine) + gen12_configure_oar_context(stream, false); + + /* Make sure we disable noa to save power. */ + xe_mmio_rmw32(stream->gt, RPM_CONFIG1, GEN10_GT_NOA_ENABLE, 0); + + sqcnt1 = GEN12_SQCNT1_PMON_ENABLE | + (HAS_OA_BPC_REPORTING(stream->gt->tile->xe) ? GEN12_SQCNT1_OABPC : 0); + + /* Reset PMON Enable to save power. */ + xe_mmio_rmw32(stream->gt, GEN12_SQCNT1, sqcnt1, 0); +} + +static int intel_guc_slpc_override_gucrc_mode(struct xe_gt *gt, u32 mode) +{ + return 0; // FIXME +} + +static int intel_guc_slpc_unset_gucrc_mode(struct xe_gt *gt) +{ + return 0; // FIXME +} + +void xe_oa_engine_pm_get(struct xe_oa_stream *stream) +{ + xe_device_mem_access_get(stream->oa->xe); // FIXME +} + +void xe_oa_engine_pm_put(struct xe_oa_stream *stream) +{ + xe_device_mem_access_put(stream->oa->xe); // FIXME +} + +static void xe_oa_stream_destroy(struct xe_oa_stream *stream) +{ + struct xe_oa_group *g = stream->hwe->oa_group; + struct xe_gt *gt = stream->hwe->gt; + struct xe_oa *oa = stream->oa; + + if (WARN_ON(stream != g->exclusive_stream)) + return; + + /* + * Unset exclusive_stream first, it will be checked while disabling + * the metric set on gen8+. + * + * See i915_oa_init_reg_state() and lrc_configure_all_contexts() + */ + WRITE_ONCE(g->exclusive_stream, NULL); + gen12_disable_metric_set(stream); + + free_oa_buffer(stream); + + /* + * Wa_16011777198:dg2: Unset the override of GUCRC mode to enable rc6. + */ + if (stream->override_gucrc) + drm_WARN_ON(>->tile->xe->drm, intel_guc_slpc_unset_gucrc_mode(gt)); + + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + xe_oa_engine_pm_put(stream); + + if (stream->engine) + oa_put_render_ctx_id(stream); + + free_oa_configs(stream); + free_noa_wait(stream); + + if (oa->spurious_report_rs.missed) { + drm_notice(>->tile->xe->drm, + "%d spurious OA report notices suppressed due to ratelimiting\n", + oa->spurious_report_rs.missed); + } +} + +static int alloc_oa_buffer(struct xe_oa_stream *stream) +{ + struct xe_bo *bo; + + BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE); + BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M); + + bo = xe_bo_create_pin_map(stream->gt->tile->xe, stream->gt->tile, NULL, + OA_BUFFER_SIZE, ttm_bo_type_kernel, + // XE_BO_CREATE_VRAM_IF_DGFX(gt) | + XE_BO_CREATE_SYSTEM_BIT | // FIXME: check + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + stream->oa_buffer.bo = bo; + stream->oa_buffer.vaddr = bo->vmap.is_iomem ? + bo->vmap.vaddr_iomem : bo->vmap.vaddr; + + return 0; +} + +static u32 *save_restore_register(struct xe_oa_stream *stream, u32 *cs, + bool save, struct xe_reg reg, u32 offset, + u32 dword_count) +{ + u32 cmd; + u32 d; + + cmd = save ? 
MI_STORE_REGISTER_MEM : MI_LOAD_REGISTER_MEM; + cmd |= MI_SRM_LRM_GLOBAL_GTT; + cmd++; + + for (d = 0; d < dword_count; d++) { + *cs++ = cmd; + *cs++ = reg.addr + 4 * d; + *cs++ = xe_bo_ggtt_addr(stream->noa_wait) + offset + 4 * d; + *cs++ = 0; + } + + return cs; +} + +static u64 div_u64_roundup(u64 nom, u32 den) +{ + return div_u64(nom + den - 1, den); +} + +static u64 intel_gt_ns_to_clock_interval(const struct xe_gt *gt, u64 ns) +{ + return div_u64_roundup(gt->info.clock_freq * ns, NSEC_PER_SEC); +} + +static int alloc_noa_wait(struct xe_oa_stream *stream) +{ + struct xe_bo *bo; + const u64 delay_ticks = 0xffffffffffffffff - + intel_gt_ns_to_clock_interval(stream->gt, + atomic64_read(&stream->oa->noa_programming_delay)); + const u32 base = stream->hwe->mmio_base; +#define CS_GPR(x) GEN8_RING_CS_GPR(base, x) + u32 *batch, *ts0, *cs, *jump; + int ret, i; + enum { + START_TS, + NOW_TS, + DELTA_TS, + JUMP_PREDICATE, + DELTA_TARGET, + N_CS_GPR + }; + struct xe_reg mi_predicate_result = HAS_MI_SET_PREDICATE(stream->gt->tile->xe) ? + MI_PREDICATE_RESULT_2(base) : + MI_PREDICATE_RESULT_1(RENDER_RING_BASE); + + bo = xe_bo_create_pin_map(stream->gt->tile->xe, stream->gt->tile, NULL, + 8192, ttm_bo_type_kernel, + // XE_BO_CREATE_VRAM_IF_DGFX(gt) | + XE_BO_CREATE_SYSTEM_BIT | // FIXME: check + XE_BO_CREATE_GGTT_BIT); + if (IS_ERR(bo)) + return PTR_ERR(bo); + + batch = cs = bo->vmap.is_iomem ? bo->vmap.vaddr_iomem : bo->vmap.vaddr; + stream->noa_wait = bo; + +#define GPR_SAVE_OFFSET 4096 +#define PREDICATE_SAVE_OFFSET 4160 + + /* Save registers. */ + for (i = 0; i < N_CS_GPR; i++) + cs = save_restore_register( + stream, cs, true /* save */, CS_GPR(i), + GPR_SAVE_OFFSET + 8 * i, 2); + cs = save_restore_register( + stream, cs, true /* save */, mi_predicate_result, + PREDICATE_SAVE_OFFSET, 1); + + /* First timestamp snapshot location. */ + ts0 = cs; + + /* + * Initial snapshot of the timestamp register to implement the wait. + * We work with 32b values, so clear out the top 32b bits of the + * register because the ALU works 64bits. + */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = CS_GPR(START_TS).addr + 4; + *cs++ = 0; + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = RING_TIMESTAMP(base).addr; + *cs++ = CS_GPR(START_TS).addr; + + /* + * This is the location we're going to jump back into until the + * required amount of time has passed. + */ + jump = cs; + + /* + * Take another snapshot of the timestamp register. Take care to clear + * up the top 32bits of CS_GPR(1) as we're using it for other + * operations below. + */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = CS_GPR(NOW_TS).addr + 4; + *cs++ = 0; + *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); + *cs++ = RING_TIMESTAMP(base).addr; + *cs++ = CS_GPR(NOW_TS).addr; + + /* + * Do a diff between the 2 timestamps and store the result back into + * CS_GPR(1). + */ + *cs++ = MI_MATH(5); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS)); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS)); + *cs++ = MI_MATH_SUB; + *cs++ = MI_MATH_STORE(MI_MATH_REG(DELTA_TS), MI_MATH_REG_ACCU); + *cs++ = MI_MATH_STORE(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF); + + /* + * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the + * timestamp have rolled over the 32bits) into the predicate register + * to be used for the predicated jump. 
+	 */
+	*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+	*cs++ = CS_GPR(JUMP_PREDICATE).addr;
+	*cs++ = mi_predicate_result.addr;
+
+	if (HAS_MI_SET_PREDICATE(stream->gt->tile->xe))
+		*cs++ = MI_SET_PREDICATE | 1;
+
+	/* Restart from the beginning if we had timestamps roll over. */
+	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_PREDICATE;
+	// *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4; // FIXME
+	*cs++ = 0;
+
+	if (HAS_MI_SET_PREDICATE(stream->gt->tile->xe))
+		*cs++ = MI_SET_PREDICATE;
+
+	/*
+	 * Now add the diff between the two previous timestamps to:
+	 * ((1 << 64) - 1) - delay_ns
+	 *
+	 * When the Carry Flag contains 1 this means the elapsed time is
+	 * longer than the expected delay, and we can exit the wait loop.
+	 */
+	*cs++ = MI_LOAD_REGISTER_IMM(2);
+	*cs++ = CS_GPR(DELTA_TARGET).addr;
+	*cs++ = lower_32_bits(delay_ticks);
+	*cs++ = CS_GPR(DELTA_TARGET).addr + 4;
+	*cs++ = upper_32_bits(delay_ticks);
+
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(DELTA_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(DELTA_TARGET));
+	*cs++ = MI_MATH_ADD;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);
+
+	*cs++ = MI_ARB_CHECK;
+
+	/*
+	 * Transfer the result into the predicate register to be used for the
+	 * predicated jump.
+	 */
+	*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
+	*cs++ = CS_GPR(JUMP_PREDICATE).addr;
+	*cs++ = mi_predicate_result.addr;
+
+	if (HAS_MI_SET_PREDICATE(stream->gt->tile->xe))
+		*cs++ = MI_SET_PREDICATE | 1;
+
+	/* Predicate the jump. */
+	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_PREDICATE;
+	// *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4; // FIXME
+	*cs++ = 0;
+
+	if (HAS_MI_SET_PREDICATE(stream->gt->tile->xe))
+		*cs++ = MI_SET_PREDICATE;
+
+	/* Restore registers. */
+	for (i = 0; i < N_CS_GPR; i++)
+		cs = save_restore_register(
+			stream, cs, false /* restore */, CS_GPR(i),
+			GPR_SAVE_OFFSET + 8 * i, 2);
+	cs = save_restore_register(
+		stream, cs, false /* restore */, mi_predicate_result,
+		PREDICATE_SAVE_OFFSET, 1);
+
+	/* And return to the ring. */
+	*cs++ = MI_BATCH_BUFFER_END;
+
+	return 0;
+}
+
+void write_cs_mi_lri(struct xe_bb *bb, const struct xe_oa_reg *reg_data, u32 n_regs)
+{
+	u32 i;
+
+#define MI_LOAD_REGISTER_IMM_MAX_REGS (126)
+
+	for (i = 0; i < n_regs; i++) {
+		if ((i % MI_LOAD_REGISTER_IMM_MAX_REGS) == 0) {
+			u32 n_lri = min_t(u32,
+					  n_regs - i,
+					  MI_LOAD_REGISTER_IMM_MAX_REGS);
+
+			bb->cs[bb->len++] = MI_LOAD_REGISTER_IMM(n_lri);
+		}
+		bb->cs[bb->len++] = reg_data[i].addr.addr;
+		bb->cs[bb->len++] = reg_data[i].value;
+	}
+}
+
+static int num_lri_dwords(int num_regs)
+{
+	int count = 0;
+
+	if (num_regs > 0) {
+		count += DIV_ROUND_UP(num_regs, MI_LOAD_REGISTER_IMM_MAX_REGS);
+		count += num_regs * 2;
+	}
+
+	return count;
+}
+
+static struct xe_oa_config_bo *
+alloc_oa_config_buffer(struct xe_oa_stream *stream, struct xe_oa_config *oa_config)
+{
+	struct xe_oa_config_bo *oa_bo;
+	size_t config_length = 0;
+	struct xe_bb *bb;
+
+	oa_bo = kzalloc(sizeof(*oa_bo), GFP_KERNEL);
+	if (!oa_bo)
+		return ERR_PTR(-ENOMEM);
+
+	config_length += num_lri_dwords(oa_config->mux_regs_len);
+	config_length += num_lri_dwords(oa_config->b_counter_regs_len);
+	config_length += num_lri_dwords(oa_config->flex_regs_len);
+#if 1 // FIXME: noa_wait (see 93937659dc64)
+	config_length++; /* MI_BATCH_BUFFER_END */
+#else
+	config_length += 4; /* MI_BATCH_BUFFER_START */
+#endif
+	config_length = ALIGN(sizeof(u32) * config_length, XE_PAGE_SIZE) / sizeof(u32);
+
+	bb = xe_bb_new(stream->gt, config_length, false);
+	if (IS_ERR(bb))
+		goto err_free;
+
+	write_cs_mi_lri(bb, oa_config->mux_regs, oa_config->mux_regs_len);
+	write_cs_mi_lri(bb, oa_config->b_counter_regs, oa_config->b_counter_regs_len);
+	write_cs_mi_lri(bb, oa_config->flex_regs, oa_config->flex_regs_len);
+
+#if 0 // FIXME: noa_wait (see 93937659dc64)
+	// xe_bb_create_job adds MI_BATCH_BUFFER_END
+	// TBD: how to handle noa_wait in xe_bb_create_job
+
+	/* Jump into the active wait. */
+	bb->cs[bb->len++] = MI_BATCH_BUFFER_START;
+	bb->cs[bb->len++] = xe_bo_ggtt_addr(stream->noa_wait);
+	bb->cs[bb->len++] = 0;
+#endif
+	oa_bo->bb = bb;
+	oa_bo->oa_config = xe_oa_config_get(oa_config);
+	llist_add(&oa_bo->node, &stream->oa_config_bos);
+
+	return oa_bo;
+err_free:
+	kfree(oa_bo);
+	return ERR_CAST(bb);
+}
+
+static struct xe_oa_config_bo *get_oa_vma(struct xe_oa_stream *stream)
+{
+	struct xe_oa_config *oa_config = stream->oa_config;
+	struct xe_oa_config_bo *oa_bo;
+
+	/*
+	 * Look for the buffer in the already allocated BOs attached
+	 * to the stream.
+	 */
+	llist_for_each_entry(oa_bo, stream->oa_config_bos.first, node) {
+		if (oa_bo->oa_config == oa_config &&
+		    memcmp(oa_bo->oa_config->uuid,
+			   oa_config->uuid,
+			   sizeof(oa_config->uuid)) == 0)
+			goto out;
+	}
+
+	oa_bo = alloc_oa_config_buffer(stream, oa_config);
+out:
+	return oa_bo;
+}
+
+// FIXME: check entire function and called functions
+static int emit_oa_config(struct xe_oa_stream *stream)
+{
+	struct xe_oa_config_bo *oa_bo;
+	int err = 0;
+
+	oa_bo = get_oa_vma(stream);
+	if (IS_ERR(oa_bo)) {
+		err = PTR_ERR(oa_bo);
+		goto exit;
+	}
+
+	err = oa_submit_bb(stream, oa_bo->bb);
+exit:
+	return err;
+}
+
+static __UNUSED__ void oa_context(struct xe_oa_stream *stream) {}
+
+static __UNUSED__ u32 oa_config_flex_reg(const struct xe_oa_config *oa_config,
+					 struct xe_reg reg)
+{
+	u32 mmio = reg.addr;
+	int i;
+
+	/*
+	 * This arbitrary default will select the 'EU FPU0 Pipeline
+	 * Active' event. In the future it's anticipated that there
+	 * will be an explicit 'No Event' we can select, but not yet...
+ */ + if (!oa_config) + return 0; + + for (i = 0; i < oa_config->flex_regs_len; i++) { + if (oa_config->flex_regs[i].addr.addr == mmio) + return oa_config->flex_regs[i].value; + } + + return 0; +} + +static __UNUSED__ void gen8_update_reg_state_unlocked(const struct xe_oa_stream *stream) {} + +static __UNUSED__ u32 oag_report_ctx_switches(const struct xe_oa_stream *stream) +{ + return _MASKED_FIELD(GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS, + (stream->sample_flags & SAMPLE_OA_REPORT) ? + 0 : GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS); +} + +static int gen12_enable_metric_set(struct xe_oa_stream *stream) +{ + u32 sqcnt1; + int ret; + + /* + * Wa_1508761755:xehpsdv, dg2 + * EU NOA signals behave incorrectly if EU clock gating is enabled. + * Disable thread stall DOP gating and EU DOP gating. + */ + if (stream->gt->tile->xe->info.platform == XE_DG2) { + xe_gt_mcr_multicast_write(stream->gt, GEN8_ROW_CHICKEN, // FIXME: check + _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE)); + xe_mmio_write32(stream->gt, GEN7_ROW_CHICKEN2, + _MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING)); + } + + xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_debug, + /* Disable clk ratio reports, like previous Gens. */ + _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS | + GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO) | + /* + * If the user didn't require OA reports, instruct + * the hardware not to emit ctx switch reports. + */ + oag_report_ctx_switches(stream)); + + xe_mmio_write32(stream->gt, __oa_regs(stream)->oa_ctx_ctrl, stream->periodic ? + (GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME | + GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE | + (stream->period_exponent << + GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT)) : 0); + + /* + * Initialize Super Queue Internal Cnt Register + * Set PMON Enable in order to collect valid metrics. + * Enable byets per clock reporting in OA for XEHPSDV onward. + */ + sqcnt1 = GEN12_SQCNT1_PMON_ENABLE | + (HAS_OA_BPC_REPORTING(stream->gt->tile->xe) ? GEN12_SQCNT1_OABPC : 0); + + xe_mmio_rmw32(stream->gt, GEN12_SQCNT1, 0, sqcnt1); + + /* + * Update all contexts prior writing the mux configurations as we need + * to make sure all slices/subslices are ON before writing to NOA + * registers. + */ + ret = gen12_configure_all_contexts(stream, true); + if (ret) + return ret; + + /* + * For Gen12, performance counters are context + * saved/restored. Only enable it for the context that + * requested this. + */ + if (stream->engine) { + ret = gen12_configure_oar_context(stream, true); + if (ret) + return ret; + } + + return emit_oa_config(stream); +} + +static void xe_oa_stream_enable(struct xe_oa_stream *stream) +{ + stream->pollin = false; + + gen12_oa_enable(stream); + + if (stream->sample_flags & SAMPLE_OA_REPORT) + hrtimer_start(&stream->poll_check_timer, + ns_to_ktime(stream->poll_oa_period), + HRTIMER_MODE_REL_PINNED); +} + +static void xe_oa_stream_disable(struct xe_oa_stream *stream) +{ + gen12_oa_disable(stream); + + if (stream->sample_flags & SAMPLE_OA_REPORT) + hrtimer_cancel(&stream->poll_check_timer); +} + +static int xe_oa_stream_enable_sync(struct xe_oa_stream *stream) +{ + return gen12_enable_metric_set(stream); +} + +static __UNUSED__ void get_default_sseu_config(void) {} +static __UNUSED__ void get_sseu_config(void) {} + +/* + * OA timestamp frequency = CS timestamp frequency in most platforms. On some + * platforms OA unit ignores the CTC_SHIFT and the 2 timestamps differ. In such + * cases, return the adjusted CS timestamp frequency to the user. 
+ */ +u32 xe_oa_timestamp_frequency(struct xe_device *xe) +{ + struct xe_gt *gt = xe_root_mmio_gt(xe); + u32 reg, shift; + + /* + * Wa_18013179988:dg2 + * Wa_14015846243:mtl + */ + switch (xe->info.platform) { + case XE_DG2: + case XE_METEORLAKE: + xe_device_mem_access_get(xe); // FIXME: check + XE_WARN_ON(xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + reg = xe_mmio_read32(xe_root_mmio_gt(xe), RPM_CONFIG0); + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + xe_device_mem_access_put(xe); + + shift = REG_FIELD_GET(RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK, reg); + return xe_root_mmio_gt(xe)->info.clock_freq << (3 - shift); + + default: + return xe_root_mmio_gt(xe)->info.clock_freq; + } + + // FIXME: should this be per gt, even in i915? + +} + +static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer) +{ + struct xe_oa_stream *stream = + container_of(hrtimer, typeof(*stream), poll_check_timer); + + if (oa_buffer_check_unlocked(stream)) { + stream->pollin = true; + wake_up(&stream->poll_wq); + } + + hrtimer_forward_now(hrtimer, + ns_to_ktime(stream->poll_oa_period)); + + return HRTIMER_RESTART; +} + +static int xe_oa_stream_init(struct xe_oa_stream *stream, + struct drm_xe_oa_open_param *param, + struct perf_open_properties *props) +{ + struct xe_oa_group *g = props->hwe->oa_group; + struct xe_gt *gt = props->hwe->gt; + struct xe_oa *oa = stream->oa; + int ret; + + /* FIXME: More checks here should be moved to read_properties_unlocked */ + /* Also cleanup 'struct xe_oa_stream' for duplictates */ + + /* + * If the sysfs metrics/ directory wasn't registered for some + * reason then don't let userspace try their luck with config IDs + */ + if (!oa->metrics_kobj) { + drm_dbg(&oa->xe->drm, "OA metrics weren't advertised via sysfs\n"); + return -EINVAL; + } + + if (!(props->sample_flags & SAMPLE_OA_REPORT) && + (GRAPHICS_VER(oa->xe) < 12 || !stream->engine)) { + drm_dbg(&oa->xe->drm, "Only OA report sampling supported\n"); + return -EINVAL; + } + + /* + * To avoid the complexity of having to accurately filter counter + * reports and marshal to the appropriate client we currently only + * allow exclusive access + */ + if (g->exclusive_stream) { + drm_dbg(&oa->xe->drm, "OA unit already in use\n"); + return -EBUSY; + } + + stream->hwe = props->hwe; + stream->gt = stream->hwe->gt; + + stream->sample_size = sizeof(struct drm_xe_oa_record_header); + + stream->oa_buffer.format = &oa->oa_formats[props->oa_format]; + if (drm_WARN_ON(&oa->xe->drm, stream->oa_buffer.format->size == 0)) + return -EINVAL; + + stream->sample_flags = props->sample_flags; + stream->sample_size += stream->oa_buffer.format->size; + + stream->hold_preemption = props->hold_preemption; + + stream->periodic = props->oa_periodic; + if (stream->periodic) + stream->period_exponent = props->oa_period_exponent; + + if (stream->engine) { + ret = oa_get_render_ctx_id(stream); + if (ret) { + drm_dbg(&oa->xe->drm, "Invalid context id to filter with\n"); + return ret; + } + } + + ret = alloc_noa_wait(stream); + if (ret) { + drm_dbg(&oa->xe->drm, "Unable to allocate NOA wait batch buffer\n"); + goto err_noa_wait_alloc; + } + + stream->oa_config = xe_oa_get_oa_config(oa, props->metrics_set); + if (!stream->oa_config) { + drm_dbg(&oa->xe->drm, "Invalid OA config id=%i\n", props->metrics_set); + ret = -EINVAL; + goto err_config; + } + + /* PRM - observability performance counters: + * + * OACONTROL, performance counter enable, note: + * + * "When this bit is set, in order to have coherent counts, + * RC6 power state 
and trunk clock gating must be disabled. + * This can be achieved by programming MMIO registers as + * 0xA094=0 and 0xA090[31]=1" + * + * In our case we are expecting that taking pm + FORCEWAKE + * references will effectively disable RC6. + */ + + xe_oa_engine_pm_get(stream); + XE_WARN_ON(xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + + /* + * Wa_16011777198:dg2: GuC resets render as part of the Wa. This causes + * OA to lose the configuration state. Prevent this by overriding GUCRC + * mode. + */ + if (xe_device_guc_submission_enabled(oa->xe) && + (IS_SUBPLATFORM_STEP(oa->xe, XE_DG2, XE_SUBPLATFORM_DG2_G10, STEP_A0, STEP_C0) || + IS_SUBPLATFORM_STEP(oa->xe, XE_DG2, XE_SUBPLATFORM_DG2_G11, STEP_A0, STEP_B0))) { + ret = intel_guc_slpc_override_gucrc_mode(gt, 0); // FIXME + if (ret) { + drm_dbg(&oa->xe->drm, "Unable to override gucrc mode\n"); + goto err_gucrc; + } + + stream->override_gucrc = true; + } + + ret = alloc_oa_buffer(stream); + if (ret) + goto err_oa_buf_alloc; + + // stream->engine->gt->perf.sseu = props->sseu; // FIXME + WRITE_ONCE(g->exclusive_stream, stream); + + ret = xe_oa_stream_enable_sync(stream); + if (ret) { + drm_dbg(&oa->xe->drm, "Unable to enable metric set\n"); + goto err_enable; + } + + drm_dbg(&oa->xe->drm, "opening stream oa config uuid=%s\n", + stream->oa_config->uuid); + + hrtimer_init(&stream->poll_check_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + stream->poll_check_timer.function = oa_poll_check_timer_cb; + init_waitqueue_head(&stream->poll_wq); + spin_lock_init(&stream->oa_buffer.ptr_lock); + mutex_init(&stream->lock); + + return 0; + +err_enable: + WRITE_ONCE(g->exclusive_stream, NULL); + gen12_disable_metric_set(stream); + + free_oa_buffer(stream); + +err_oa_buf_alloc: + if (stream->override_gucrc) + intel_guc_slpc_unset_gucrc_mode(gt); + +err_gucrc: + XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL)); + xe_oa_engine_pm_put(stream); + + free_oa_configs(stream); + +err_config: + free_noa_wait(stream); + +err_noa_wait_alloc: + if (stream->engine) + oa_put_render_ctx_id(stream); + + return ret; +} + +__UNUSED__ void xe_oa_init_reg_state(void) {} + +static ssize_t xe_oa_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct xe_oa_stream *stream = file->private_data; + size_t offset = 0; + int ret; + + /* To ensure it's handled consistently we simply treat all reads of a + * disabled stream as an error. In particular it might otherwise lead + * to a deadlock for blocking file descriptors... + */ + if (!stream->enabled || !(stream->sample_flags & SAMPLE_OA_REPORT)) + return -EIO; + + if (!(file->f_flags & O_NONBLOCK)) { + /* There's the small chance of false positives from + * stream->ops->wait_unlocked. + * + * E.g. with single context filtering since we only wait until + * oabuffer has >= 1 report we don't immediately know whether + * any reports really belong to the current context + */ + do { + ret = xe_oa_wait_unlocked(stream); + if (ret) + return ret; + + mutex_lock(&stream->lock); + ret = __xe_oa_read(stream, buf, count, &offset); + mutex_unlock(&stream->lock); + } while (!offset && !ret); + } else { + mutex_lock(&stream->lock); + ret = __xe_oa_read(stream, buf, count, &offset); + mutex_unlock(&stream->lock); + } + + /* We allow the poll checking to sometimes report false positive EPOLLIN + * events where we might actually report EAGAIN on read() if there's + * not really any data available. 
In this situation though we don't + * want to enter a busy loop between poll() reporting a EPOLLIN event + * and read() returning -EAGAIN. Clearing the oa.pollin state here + * effectively ensures we back off until the next hrtimer callback + * before reporting another EPOLLIN event. + * The exception to this is if ops->read() returned -ENOSPC which means + * that more OA data is available than could fit in the user provided + * buffer. In this case we want the next poll() call to not block. + */ + if (ret != -ENOSPC) + stream->pollin = false; + + /* Possible values for ret are 0, -EFAULT, -ENOSPC, -EIO, ... */ + return offset ?: (ret ?: -EAGAIN); +} + +static __poll_t xe_oa_poll_locked(struct xe_oa_stream *stream, + struct file *file, poll_table *wait) +{ + __poll_t events = 0; + + xe_oa_poll_wait(stream, file, wait); + + /* Note: we don't explicitly check whether there's something to read + * here since this path may be very hot depending on what else + * userspace is polling, or on the timeout in use. We rely solely on + * the hrtimer/oa_poll_check_timer_cb to notify us when there are + * samples to read. + */ + if (stream->pollin) + events |= EPOLLIN; + + return events; +} + +static __poll_t xe_oa_poll(struct file *file, poll_table *wait) +{ + struct xe_oa_stream *stream = file->private_data; + __poll_t ret; + + mutex_lock(&stream->lock); + ret = xe_oa_poll_locked(stream, file, wait); + mutex_unlock(&stream->lock); + + return ret; +} + +static void xe_engine_set_nopreempt(struct xe_engine *engine) +{ + // FIXME +} + +static void xe_engine_clear_nopreempt(struct xe_engine *engine) +{ + // FIXME +} + +static void xe_oa_enable_locked(struct xe_oa_stream *stream) +{ + if (stream->enabled) + return; + + /* Allow stream->ops->enable() to refer to this */ + stream->enabled = true; + + xe_oa_stream_enable(stream); + + if (stream->hold_preemption) + xe_engine_set_nopreempt(stream->engine); +} + +static void xe_oa_disable_locked(struct xe_oa_stream *stream) +{ + if (!stream->enabled) + return; + + /* Allow stream->ops->disable() to refer to this */ + stream->enabled = false; + + if (stream->hold_preemption) + xe_engine_clear_nopreempt(stream->engine); + + xe_oa_stream_disable(stream); +} + +static long xe_oa_config_locked(struct xe_oa_stream *stream, + unsigned long metrics_set) +{ + struct xe_oa_config *config; + long ret = stream->oa_config->id; + + config = xe_oa_get_oa_config(stream->oa, metrics_set); + if (!config) + return -EINVAL; + + if (config != stream->oa_config) { + int err; + + /* + * If OA is bound to a specific context, emit the + * reconfiguration inline from that context. The update + * will then be ordered with respect to submission on that + * context. + * + * When set globally, we use a low priority kernel context, + * so it will effectively take effect when idle. 
+ */ + // err = emit_oa_config(stream, config, oa_context(stream), NULL); // FIXME + if (!err) + config = xchg(&stream->oa_config, config); + else + ret = err; + } + + xe_oa_config_put(config); + + return ret; +} + +static long xe_oa_ioctl_locked(struct xe_oa_stream *stream, + unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case XE_OA_IOCTL_ENABLE: + xe_oa_enable_locked(stream); + return 0; + case XE_OA_IOCTL_DISABLE: + xe_oa_disable_locked(stream); + return 0; + case XE_OA_IOCTL_CONFIG: + return xe_oa_config_locked(stream, arg); + } + + return -EINVAL; +} + +static long xe_oa_ioctl(struct file *file, + unsigned int cmd, + unsigned long arg) +{ + struct xe_oa_stream *stream = file->private_data; + long ret; + + mutex_lock(&stream->lock); + ret = xe_oa_ioctl_locked(stream, cmd, arg); + mutex_unlock(&stream->lock); + + return ret; +} + +static void xe_oa_destroy_locked(struct xe_oa_stream *stream) +{ + if (stream->enabled) + xe_oa_disable_locked(stream); + + xe_oa_stream_destroy(stream); + + if (stream->engine) + xe_engine_put(stream->engine); // FIXME: check + + kfree(stream); +} + +static int xe_oa_release(struct inode *inode, struct file *file) +{ + struct xe_oa_stream *stream = file->private_data; + struct xe_gt *gt = stream->gt; + + /* + * Within this call, we know that the fd is being closed and we have no + * other user of stream->lock. Use the perf lock to destroy the stream + * here. + */ + mutex_lock(>->oa.lock); + xe_oa_destroy_locked(stream); + mutex_unlock(>->oa.lock); + + /* Release the reference the perf stream kept on the driver. */ + drm_dev_put(>->tile->xe->drm); + + return 0; +} + +static const struct file_operations fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .release = xe_oa_release, + .poll = xe_oa_poll, + .read = xe_oa_read, + .unlocked_ioctl = xe_oa_ioctl, + /* Our ioctl have no arguments, so it's safe to use the same function + * to handle 32bits compatibility. + */ + .compat_ioctl = xe_oa_ioctl, +}; + +static int +xe_oa_stream_open_ioctl_locked(struct xe_oa *oa, + struct drm_xe_oa_open_param *param, + struct perf_open_properties *props, + struct drm_file *file) +{ + struct xe_file *xef = to_xe_file(file); + struct xe_engine *engine = NULL; + struct xe_oa_stream *stream = NULL; + unsigned long f_flags = 0; + bool privileged_op = true; + int stream_fd; + int ret; + + if (props->single_context) { + u32 engine_id = props->ctx_handle; + + engine = xe_engine_lookup(xef, engine_id); + if (XE_IOCTL_ERR(oa->xe, !engine)) { + ret = -ENOENT; + goto err; + } + } + + /* + * For Gen12+ we gain a new OAR unit that only monitors the RCS on a + * per context basis. So we can relax requirements there if the user + * doesn't request global stream access (i.e. query based sampling + * using MI_RECORD_PERF_COUNT. + */ + if (GRAPHICS_VER(oa->xe) >= 12 && engine && + (props->sample_flags & SAMPLE_OA_REPORT) == 0) + privileged_op = false; + + if (props->hold_preemption) { + if (!props->single_context) { + drm_dbg(&oa->xe->drm, + "preemption disable with no context\n"); + ret = -EINVAL; + goto err; + } + privileged_op = true; + } + + // get_default_sseu_config(&props->sseu, props->engine); // FIXME + + /* Similar to perf's kernel.perf_paranoid_cpu sysctl option + * we check a dev.xe.perf_stream_paranoid sysctl option + * to determine if it's ok to access system wide OA counters + * without CAP_PERFMON or CAP_SYS_ADMIN privileges. 
+ */ + if (privileged_op && + xe_oa_stream_paranoid && !perfmon_capable()) { + drm_dbg(&oa->xe->drm, + "Insufficient privileges to open xe perf stream\n"); + ret = -EACCES; + goto err_engine; + } + + stream = kzalloc(sizeof(*stream), GFP_KERNEL); + if (!stream) { + ret = -ENOMEM; + goto err_engine; + } + + stream->xef = xef; + stream->oa = oa; + stream->engine = engine; + stream->poll_oa_period = props->poll_oa_period; + + ret = xe_oa_stream_init(stream, param, props); + if (ret) + goto err_alloc; + + /* we avoid simply assigning stream->sample_flags = props->sample_flags + * to have _stream_init check the combination of sample flags more + * thoroughly, but still this is the expected result at this point. + */ + if (WARN_ON(stream->sample_flags != props->sample_flags)) { + ret = -ENODEV; + goto err_flags; + } + + if (param->flags & XE_OA_FLAG_FD_CLOEXEC) + f_flags |= O_CLOEXEC; + if (param->flags & XE_OA_FLAG_FD_NONBLOCK) + f_flags |= O_NONBLOCK; + + stream_fd = anon_inode_getfd("[xe_oa]", &fops, stream, f_flags); + if (stream_fd < 0) { + ret = stream_fd; + goto err_flags; + } + + if (!(param->flags & XE_OA_FLAG_DISABLED)) + xe_oa_enable_locked(stream); + + /* Take a reference on the driver that will be kept with stream_fd + * until its release. + */ + drm_dev_get(&oa->xe->drm); + + return stream_fd; +err_flags: + xe_oa_stream_destroy(stream); +err_alloc: + kfree(stream); +err_engine: + if (engine) + xe_engine_put(engine); +err: + return ret; +} + +static u64 oa_exponent_to_ns(struct xe_oa *oa, int exponent) +{ + u64 nom = (2ULL << exponent) * NSEC_PER_SEC; + u32 den = xe_oa_timestamp_frequency(oa->xe); + + return div_u64(nom + den - 1, den); +} + +static bool oa_format_valid(struct xe_oa *oa, enum drm_xe_oa_format format) +{ + return test_bit(format, oa->format_mask); +} + +static void oa_format_add(struct xe_oa *oa, enum drm_xe_oa_format format) +{ + __set_bit(format, oa->format_mask); +} + +static int read_properties_unlocked(struct xe_oa *oa, + u64 __user *uprops, + u32 n_props, + struct perf_open_properties *props) +{ + const struct xe_oa_format *f; + u64 __user *uprop = uprops; + bool config_instance = false; + bool config_class = false; + u8 class, instance; + struct xe_gt *gt; + u32 i; + int ret; + + memset(props, 0, sizeof(struct perf_open_properties)); + props->poll_oa_period = DEFAULT_POLL_PERIOD_NS; + + /* Considering that ID = 0 is reserved and assuming that we don't + * (currently) expect any configurations to ever specify duplicate + * values for a particular property ID then the last _PROP_MAX value is + * one greater than the maximum number of properties we expect to get + * from userspace. 
+ */ + if (!n_props || n_props >= DRM_XE_OA_PROP_MAX) { + drm_dbg(&oa->xe->drm, + "Invalid number of xe perf properties given\n"); + return -EINVAL; + } + + /* Defaults when class:instance is not passed */ + class = XE_ENGINE_CLASS_RENDER; + instance = 0; + + for (i = 0; i < n_props; i++) { + u64 oa_period, oa_freq_hz; + u64 id, value; + + ret = get_user(id, uprop); + if (ret) + return ret; + + ret = get_user(value, uprop + 1); + if (ret) + return ret; + + if (id == 0 || id >= DRM_XE_OA_PROP_MAX) { + drm_dbg(&oa->xe->drm, + "Unknown xe perf property ID\n"); + return -EINVAL; + } + + switch ((enum drm_xe_oa_property_id)id) { + case DRM_XE_OA_PROP_CTX_HANDLE: + props->single_context = 1; + props->ctx_handle = value; + break; + case DRM_XE_OA_PROP_SAMPLE_OA: + if (value) + props->sample_flags |= SAMPLE_OA_REPORT; + break; + case DRM_XE_OA_PROP_OA_METRICS_SET: + if (value == 0) { + drm_dbg(&oa->xe->drm, + "Unknown OA metric set ID\n"); + return -EINVAL; + } + props->metrics_set = value; + break; + case DRM_XE_OA_PROP_OA_FORMAT: + if (value == 0 || value >= XE_OA_FORMAT_MAX) { + drm_dbg(&oa->xe->drm, + "Out-of-range OA report format %llu\n", + value); + return -EINVAL; + } + if (!oa_format_valid(oa, value)) { + drm_dbg(&oa->xe->drm, + "Unsupported OA report format %llu\n", + value); + return -EINVAL; + } + props->oa_format = value; + break; + case DRM_XE_OA_PROP_OA_EXPONENT: + if (value > OA_EXPONENT_MAX) { + drm_dbg(&oa->xe->drm, + "OA timer exponent too high (> %u)\n", + OA_EXPONENT_MAX); + return -EINVAL; + } + + /* Theoretically we can program the OA unit to sample + * e.g. every 160ns for HSW, 167ns for BDW/SKL or 104ns + * for BXT. We don't allow such high sampling + * frequencies by default unless root. + */ + + BUILD_BUG_ON(sizeof(oa_period) != 8); + oa_period = oa_exponent_to_ns(oa, value); + + /* This check is primarily to ensure that oa_period <= + * UINT32_MAX (before passing to do_div which only + * accepts a u32 denominator), but we can also skip + * checking anything < 1Hz which implicitly can't be + * limited via an integer oa_max_sample_rate. 
+ */ + if (oa_period <= NSEC_PER_SEC) { + u64 tmp = NSEC_PER_SEC; + do_div(tmp, oa_period); + oa_freq_hz = tmp; + } else + oa_freq_hz = 0; + + if (oa_freq_hz > xe_oa_max_sample_rate && !perfmon_capable()) { + drm_dbg(&oa->xe->drm, + "OA exponent would exceed the max sampling frequency (sysctl dev.xe.oa_max_sample_rate) %uHz without CAP_PERFMON or CAP_SYS_ADMIN privileges\n", + xe_oa_max_sample_rate); + return -EACCES; + } + + props->oa_periodic = true; + props->oa_period_exponent = value; + break; + case DRM_XE_OA_PROP_HOLD_PREEMPTION: + props->hold_preemption = !!value; + break; + case DRM_XE_OA_PROP_GLOBAL_SSEU: + /* + * FIXME: Confirm this, on i915 supportd only for < 12.5 + * perf_open_properties.has_sseu is removed (always false) + */ + drm_dbg(&oa->xe->drm, "SSEU config not supported\n"); + return -ENODEV; + case DRM_XE_OA_PROP_POLL_OA_PERIOD: + if (value < 100000 /* 100us */) { + drm_dbg(&oa->xe->drm, + "OA availability timer too small (%lluns < 100us)\n", + value); + return -EINVAL; + } + props->poll_oa_period = value; + break; + case DRM_XE_OA_PROP_OA_ENGINE_CLASS: + class = (u8)value; + config_class = true; + break; + case DRM_XE_OA_PROP_OA_ENGINE_INSTANCE: + instance = (u8)value; + config_instance = true; + break; + default: + // MISSING_CASE(id); + return -EINVAL; + } + + uprop += 2; + } + + if ((config_class && !config_instance) || + (config_instance && !config_class)) { + drm_dbg(&oa->xe->drm, + "OA engine-class and engine-instance parameters must be passed together\n"); + return -EINVAL; + } + + for_each_gt(gt, oa->xe, i) { + props->hwe = xe_gt_hw_engine(gt, class, instance, false); + if (props->hwe) + break; + } + if (!props->hwe) { + drm_dbg(&oa->xe->drm, + "OA engine class and instance invalid %d:%d\n", + class, instance); + return -EINVAL; + } + + if (!engine_supports_oa(props->hwe)) { + drm_dbg(&oa->xe->drm, + "Engine not supported by OA %d:%d\n", + class, instance); + return -EINVAL; + } + +#if 0 // FIXME: Do this later + /* + * Wa_14017512683: mtl[a0..c0): Use of OAM must be preceded with Media + * C6 disable in BIOS. Fail if Media C6 is enabled on steppings where OAM + * does not work as expected. 
+ */ + if (IS_MTL_MEDIA_STEP(props->engine->xe, STEP_A0, STEP_C0) && + props->engine->oa_group->type == TYPE_OAM && + intel_check_bios_c6_setup(&props->engine->gt->rc6)) { + drm_dbg(&oa->xe->drm, + "OAM requires media C6 to be disabled in BIOS\n"); + return -EINVAL; + } +#endif + i = array_index_nospec(props->oa_format, XE_OA_FORMAT_MAX); + f = &oa->oa_formats[i]; + + if (!props->oa_format || !engine_supports_oa_format(props->hwe, f->type)) { + drm_dbg(&oa->xe->drm, + "Invalid OA format %d for class %d\n", + f->type, props->hwe->class); + return -EINVAL; + } + + return 0; +} + +int xe_oa_stream_open_ioctl(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct xe_oa *oa = &to_xe_device(dev)->oa; + struct drm_xe_oa_open_param *param = data; + struct perf_open_properties props; + u32 known_open_flags; + struct xe_gt *gt; + int ret; + + if (!oa->xe) { + drm_dbg(&oa->xe->drm, + "xe perf interface not available for this system\n"); + return -ENOTSUPP; + } + + known_open_flags = XE_OA_FLAG_FD_CLOEXEC | + XE_OA_FLAG_FD_NONBLOCK | + XE_OA_FLAG_DISABLED; + if (param->flags & ~known_open_flags) { + drm_dbg(&oa->xe->drm, + "Unknown drm_xe_oa_open_param flag\n"); + return -EINVAL; + } + + ret = read_properties_unlocked(oa, + u64_to_user_ptr(param->properties_ptr), + param->num_properties, + &props); + if (ret) + return ret; + + gt = props.hwe->gt; + + mutex_lock(>->oa.lock); + ret = xe_oa_stream_open_ioctl_locked(oa, param, &props, file); + mutex_unlock(>->oa.lock); + + return ret; +} + +void xe_oa_register(struct xe_device *xe) +{ + struct xe_oa *oa = &xe->oa; + struct xe_gt *gt = xe_root_mmio_gt(xe); + + /* FIXME: needed? */ + if (!oa->xe) + return; + + /* To be sure we're synchronized with an attempted + * i915_perf_open_ioctl(); considering that we register after + * being exposed to userspace. 
+ */ + mutex_lock(>->oa.lock); + + oa->metrics_kobj = + kobject_create_and_add("metrics", + &xe->drm.primary->kdev->kobj); + + mutex_unlock(>->oa.lock); +} + +void xe_oa_unregister(struct xe_device *xe) +{ + struct xe_oa *oa = &xe->oa; + + if (!oa->metrics_kobj) + return; + + kobject_put(oa->metrics_kobj); + oa->metrics_kobj = NULL; +} + +static bool gen8_is_valid_flex_addr(struct xe_oa *oa, u32 addr) +{ + static const struct xe_reg flex_eu_regs[] = { + EU_PERF_CNTL0, + EU_PERF_CNTL1, + EU_PERF_CNTL2, + EU_PERF_CNTL3, + EU_PERF_CNTL4, + EU_PERF_CNTL5, + EU_PERF_CNTL6, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(flex_eu_regs); i++) { + if (flex_eu_regs[i].addr == addr) + return true; + } + return false; +} + +static bool reg_in_range_table(u32 addr, const struct xe_oa_range *table) +{ + while (table->start || table->end) { + if (addr >= table->start && addr <= table->end) + return true; + + table++; + } + + return false; +} + +static const struct xe_oa_range xehp_oa_b_counters[] = { + { .start = 0xdc48, .end = 0xdc48 }, /* OAA_ENABLE_REG */ + { .start = 0xdd00, .end = 0xdd48 }, /* OAG_LCE0_0 - OAA_LENABLE_REG */ +}; + +static const struct xe_oa_range gen12_oa_b_counters[] = { + { .start = 0x2b2c, .end = 0x2b2c }, /* GEN12_OAG_OA_PESS */ + { .start = 0xd900, .end = 0xd91c }, /* GEN12_OAG_OASTARTTRIG[1-8] */ + { .start = 0xd920, .end = 0xd93c }, /* GEN12_OAG_OAREPORTTRIG1[1-8] */ + { .start = 0xd940, .end = 0xd97c }, /* GEN12_OAG_CEC[0-7][0-1] */ + { .start = 0xdc00, .end = 0xdc3c }, /* GEN12_OAG_SCEC[0-7][0-1] */ + { .start = 0xdc40, .end = 0xdc40 }, /* GEN12_OAG_SPCTR_CNF */ + { .start = 0xdc44, .end = 0xdc44 }, /* GEN12_OAA_DBG_REG */ + {} +}; + +static const struct xe_oa_range mtl_oam_b_counters[] = { + { .start = 0x393000, .end = 0x39301c }, /* GEN12_OAM_STARTTRIG1[1-8] */ + { .start = 0x393020, .end = 0x39303c }, /* GEN12_OAM_REPORTTRIG1[1-8] */ + { .start = 0x393040, .end = 0x39307c }, /* GEN12_OAM_CEC[0-7][0-1] */ + { .start = 0x393200, .end = 0x39323C }, /* MPES[0-7] */ + {} +}; + +/* FIXME: Checks below have been simplified/loosened for now compared with i915 */ +static bool xehp_is_valid_b_counter_addr(struct xe_oa *oa, u32 addr) +{ + return reg_in_range_table(addr, xehp_oa_b_counters) || + reg_in_range_table(addr, gen12_oa_b_counters) || + reg_in_range_table(addr, mtl_oam_b_counters); +} + +/* + * Ref: 14010536224: + * 0x20cc is repurposed on MTL, so use a separate array for MTL. + */ +static const struct xe_oa_range mtl_oa_mux_regs[] = { + { .start = 0x0d00, .end = 0x0d04 }, /* RPM_CONFIG[0-1] */ + { .start = 0x0d0c, .end = 0x0d2c }, /* NOA_CONFIG[0-8] */ + { .start = 0x9840, .end = 0x9840 }, /* GDT_CHICKEN_BITS */ + { .start = 0x9884, .end = 0x9888 }, /* NOA_WRITE */ + { .start = 0x38d100, .end = 0x38d114}, /* VISACTL */ + {} +}; + +static const struct xe_oa_range gen12_oa_mux_regs[] = { + { .start = 0x0d00, .end = 0x0d04 }, /* RPM_CONFIG[0-1] */ + { .start = 0x0d0c, .end = 0x0d2c }, /* NOA_CONFIG[0-8] */ + { .start = 0x9840, .end = 0x9840 }, /* GDT_CHICKEN_BITS */ + { .start = 0x9884, .end = 0x9888 }, /* NOA_WRITE */ + { .start = 0x20cc, .end = 0x20cc }, /* WAIT_FOR_RC6_EXIT */ + {} +}; + +static bool gen12_is_valid_mux_addr(struct xe_oa *oa, u32 addr) +{ + if (oa->xe->info.platform == XE_METEORLAKE) + return reg_in_range_table(addr, mtl_oa_mux_regs); + else + return reg_in_range_table(addr, gen12_oa_mux_regs); +} + +static u32 mask_reg_value(u32 reg, u32 val) +{ + /* HALF_SLICE_CHICKEN2 is programmed with a the + * WaDisableSTUnitPowerOptimization workaround. 
Make sure the value + * programmed by userspace doesn't change this. + */ + if (REG_EQUAL_MCR(reg, HALF_SLICE_CHICKEN2)) + val = val & ~_MASKED_BIT_ENABLE(GEN8_ST_PO_DISABLE); + + /* WAIT_FOR_RC6_EXIT has only one bit fullfilling the function + * indicated by its name and a bunch of selection fields used by OA + * configs. + */ + if (REG_EQUAL(reg, WAIT_FOR_RC6_EXIT)) + val = val & ~_MASKED_BIT_ENABLE(HSW_WAIT_FOR_RC6_EXIT_ENABLE); + + return val; +} + +static struct xe_oa_reg *alloc_oa_regs(struct xe_oa *oa, + bool (*is_valid)(struct xe_oa *oa, u32 addr), + u32 __user *regs, + u32 n_regs) +{ + struct xe_oa_reg *oa_regs; + int err; + u32 i; + + if (!n_regs) + return NULL; + + /* No is_valid function means we're not allowing any register to be programmed. */ + BUG_ON(!is_valid); + if (!is_valid) + return ERR_PTR(-EINVAL); + + oa_regs = kmalloc_array(n_regs, sizeof(*oa_regs), GFP_KERNEL); + if (!oa_regs) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < n_regs; i++) { + u32 addr, value; + + err = get_user(addr, regs); + if (err) + goto addr_err; + + if (!is_valid(oa, addr)) { + drm_dbg(&oa->xe->drm, + "Invalid oa_reg address: %X\n", addr); + err = -EINVAL; + goto addr_err; + } + + err = get_user(value, regs + 1); + if (err) + goto addr_err; + + oa_regs[i].addr = XE_REG(addr); + oa_regs[i].value = mask_reg_value(addr, value); + + regs += 2; + } + + return oa_regs; + +addr_err: + kfree(oa_regs); + return ERR_PTR(err); +} + +static ssize_t show_dynamic_id(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct xe_oa_config *oa_config = + container_of(attr, typeof(*oa_config), sysfs_metric_id); + + return sprintf(buf, "%d\n", oa_config->id); +} + +static int create_dynamic_oa_sysfs_entry(struct xe_oa *oa, + struct xe_oa_config *oa_config) +{ + sysfs_attr_init(&oa_config->sysfs_metric_id.attr); + oa_config->sysfs_metric_id.attr.name = "id"; + oa_config->sysfs_metric_id.attr.mode = S_IRUGO; + oa_config->sysfs_metric_id.show = show_dynamic_id; + oa_config->sysfs_metric_id.store = NULL; + + oa_config->attrs[0] = &oa_config->sysfs_metric_id.attr; + oa_config->attrs[1] = NULL; + + oa_config->sysfs_metric.name = oa_config->uuid; + oa_config->sysfs_metric.attrs = oa_config->attrs; + + return sysfs_create_group(oa->metrics_kobj, + &oa_config->sysfs_metric); +} + +int xe_oa_add_config_ioctl(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct xe_oa *oa = &to_xe_device(dev)->oa; + struct drm_xe_oa_config *args = data; + struct xe_oa_config *oa_config, *tmp; + struct xe_oa_reg *regs; + int err, id; + + if (!oa->xe) { + drm_dbg(&oa->xe->drm, + "xe oa interface not available for this system\n"); + return -ENOTSUPP; + } + + if (!oa->metrics_kobj) { + drm_dbg(&oa->xe->drm, + "OA metrics weren't advertised via sysfs\n"); + return -EINVAL; + } + + if (xe_oa_stream_paranoid && !perfmon_capable()) { + drm_dbg(&oa->xe->drm, + "Insufficient privileges to add xe OA config\n"); + return -EACCES; + } + + if ((!args->mux_regs_ptr || !args->n_mux_regs) && + (!args->boolean_regs_ptr || !args->n_boolean_regs) && + (!args->flex_regs_ptr || !args->n_flex_regs)) { + drm_dbg(&oa->xe->drm, + "No OA registers given\n"); + return -EINVAL; + } + + oa_config = kzalloc(sizeof(*oa_config), GFP_KERNEL); + if (!oa_config) { + drm_dbg(&oa->xe->drm, + "Failed to allocate memory for the OA config\n"); + return -ENOMEM; + } + + oa_config->oa = oa; + kref_init(&oa_config->ref); + + if (!uuid_is_valid(args->uuid)) { + drm_dbg(&oa->xe->drm, + "Invalid uuid format for OA config\n"); + err = -EINVAL; + goto 
reg_err; + } + + /* Last character in oa_config->uuid will be 0 because oa_config is + * kzalloc. + */ + memcpy(oa_config->uuid, args->uuid, sizeof(args->uuid)); + + oa_config->mux_regs_len = args->n_mux_regs; + regs = alloc_oa_regs(oa, + gen12_is_valid_mux_addr, + u64_to_user_ptr(args->mux_regs_ptr), + args->n_mux_regs); + + if (IS_ERR(regs)) { + drm_dbg(&oa->xe->drm, + "Failed to create OA config for mux_regs\n"); + err = PTR_ERR(regs); + goto reg_err; + } + oa_config->mux_regs = regs; + + oa_config->b_counter_regs_len = args->n_boolean_regs; + regs = alloc_oa_regs(oa, + xehp_is_valid_b_counter_addr, + u64_to_user_ptr(args->boolean_regs_ptr), + args->n_boolean_regs); + + if (IS_ERR(regs)) { + drm_dbg(&oa->xe->drm, + "Failed to create OA config for b_counter_regs\n"); + err = PTR_ERR(regs); + goto reg_err; + } + oa_config->b_counter_regs = regs; + + oa_config->flex_regs_len = args->n_flex_regs; + regs = alloc_oa_regs(oa, + gen8_is_valid_flex_addr, + u64_to_user_ptr(args->flex_regs_ptr), + args->n_flex_regs); + + if (IS_ERR(regs)) { + drm_dbg(&oa->xe->drm, + "Failed to create OA config for flex_regs\n"); + err = PTR_ERR(regs); + goto reg_err; + } + oa_config->flex_regs = regs; + + err = mutex_lock_interruptible(&oa->metrics_lock); + if (err) + goto reg_err; + + /* We shouldn't have too many configs, so this iteration shouldn't be + * too costly. + */ + idr_for_each_entry(&oa->metrics_idr, tmp, id) { + if (!strcmp(tmp->uuid, oa_config->uuid)) { + drm_dbg(&oa->xe->drm, + "OA config already exists with this uuid\n"); + err = -EADDRINUSE; + goto sysfs_err; + } + } + + err = create_dynamic_oa_sysfs_entry(oa, oa_config); + if (err) { + drm_dbg(&oa->xe->drm, + "Failed to create sysfs entry for OA config\n"); + goto sysfs_err; + } + + /* Config id 0 is invalid, id 1 for kernel stored test config. 
*/ + oa_config->id = idr_alloc(&oa->metrics_idr, + oa_config, 2, + 0, GFP_KERNEL); + if (oa_config->id < 0) { + drm_dbg(&oa->xe->drm, + "Failed to create sysfs entry for OA config\n"); + err = oa_config->id; + goto sysfs_err; + } + + mutex_unlock(&oa->metrics_lock); + + drm_dbg(&oa->xe->drm, + "Added config %s id=%i\n", oa_config->uuid, oa_config->id); + + return oa_config->id; + +sysfs_err: + mutex_unlock(&oa->metrics_lock); +reg_err: + xe_oa_config_put(oa_config); + drm_dbg(&oa->xe->drm, + "Failed to add new OA config\n"); + return err; +} + +int xe_oa_remove_config_ioctl(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct xe_oa *oa = &to_xe_device(dev)->oa; + struct xe_oa_config *oa_config; + u64 *arg = data; + int ret; + + if (!oa->xe) { + drm_dbg(&oa->xe->drm, + "xe oa interface not available for this system\n"); + return -ENOTSUPP; + } + + if (xe_oa_stream_paranoid && !perfmon_capable()) { + drm_dbg(&oa->xe->drm, + "Insufficient privileges to remove xe OA config\n"); + return -EACCES; + } + + ret = mutex_lock_interruptible(&oa->metrics_lock); + if (ret) + return ret; + + oa_config = idr_find(&oa->metrics_idr, *arg); + if (!oa_config) { + drm_dbg(&oa->xe->drm, + "Failed to remove unknown OA config\n"); + ret = -ENOENT; + goto err_unlock; + } + + BUG_ON(*arg != oa_config->id); + + sysfs_remove_group(oa->metrics_kobj, &oa_config->sysfs_metric); + + idr_remove(&oa->metrics_idr, *arg); + + mutex_unlock(&oa->metrics_lock); + + drm_dbg(&oa->xe->drm, + "Removed config %s id=%i\n", oa_config->uuid, oa_config->id); + + xe_oa_config_put(oa_config); + + return 0; + +err_unlock: + mutex_unlock(&oa->metrics_lock); + return ret; +} + +static struct ctl_table oa_table[] = { + { + .procname = "perf_stream_paranoid", + .data = &xe_oa_stream_paranoid, + .maxlen = sizeof(xe_oa_stream_paranoid), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "oa_max_sample_rate", + .data = &xe_oa_max_sample_rate, + .maxlen = sizeof(xe_oa_max_sample_rate), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &oa_sample_rate_hard_limit, + }, + {} +}; + +static u32 num_perf_groups_per_gt(struct xe_gt *gt) +{ + return 1; +} + +static u32 __oam_engine_group(struct xe_hw_engine *hwe) +{ + if (GRAPHICS_VERx100(gt_to_xe(hwe->gt)) >= 1270) { + /* + * There's 1 SAMEDIA gt and 1 OAM per SAMEDIA gt. All media slices + * within the gt use the same OAM. All MTL SKUs list 1 SA MEDIA. 
+ */ + drm_WARN_ON(&hwe->gt->tile->xe->drm, + hwe->gt->info.type != XE_GT_TYPE_MEDIA); + + return OA_GROUP_OAM_SAMEDIA_0; + } + + return OA_GROUP_INVALID; +} + +static u32 __oa_engine_group(struct xe_hw_engine *hwe) +{ + switch (hwe->class) { + case XE_ENGINE_CLASS_RENDER: + return OA_GROUP_OAG; + + case XE_ENGINE_CLASS_VIDEO_DECODE: + case XE_ENGINE_CLASS_VIDEO_ENHANCE: + return __oam_engine_group(hwe); + + default: + return OA_GROUP_INVALID; + } +} + +static struct xe_oa_regs __oam_regs(u32 base) +{ + return (struct xe_oa_regs) { + base, + GEN12_OAM_HEAD_POINTER(base), + GEN12_OAM_TAIL_POINTER(base), + GEN12_OAM_BUFFER(base), + GEN12_OAM_CONTEXT_CONTROL(base), + GEN12_OAM_CONTROL(base), + GEN12_OAM_DEBUG(base), + GEN12_OAM_STATUS(base), + GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT, + }; +} + +static struct xe_oa_regs __oag_regs(void) +{ + return (struct xe_oa_regs) { + 0, + GEN12_OAG_OAHEADPTR, + GEN12_OAG_OATAILPTR, + GEN12_OAG_OABUFFER, + GEN12_OAG_OAGLBCTXCTRL, + GEN12_OAG_OACONTROL, + GEN12_OAG_OA_DEBUG, + GEN12_OAG_OASTATUS, + GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT, + }; +} + +static void oa_init_groups(struct xe_gt *gt) +{ + int i, num_groups = gt->oa.num_perf_groups; + + for (i = 0; i < num_groups; i++) { + struct xe_oa_group *g = >->oa.group[i]; + + /* Fused off engines can result in a group with num_engines == 0 */ + if (g->num_engines == 0) + continue; + + if (i == OA_GROUP_OAG && gt->info.type != XE_GT_TYPE_MEDIA) { + g->regs = __oag_regs(); + g->type = TYPE_OAG; + } else if (GRAPHICS_VERx100(gt_to_xe(gt)) >= 1270) { + g->regs = __oam_regs(mtl_oa_base[i]); + g->type = TYPE_OAM; + } + + /* Set oa_unit_ids now to ensure ids remain contiguous. */ + g->oa_unit_id = gt->tile->xe->oa.oa_unit_ids++; + } +} + +static int oa_init_gt(struct xe_gt *gt) +{ + u32 num_groups = num_perf_groups_per_gt(gt); + struct xe_hw_engine *hwe; + enum xe_hw_engine_id id; + struct xe_oa_group *g; + + g = kcalloc(num_groups, sizeof(*g), GFP_KERNEL); + if (!g) + return -ENOMEM; + + for_each_hw_engine(hwe, gt, id) { + u32 index = __oa_engine_group(hwe); + + hwe->oa_group = NULL; + if (index < num_groups) { + g[index].num_engines++; + hwe->oa_group = &g[index]; + } + } + + gt->oa.num_perf_groups = num_groups; + gt->oa.group = g; + + oa_init_groups(gt); + + return 0; +} + +static int oa_init_engine_groups(struct xe_oa *oa) +{ + struct xe_gt *gt; + int i, ret; + + for_each_gt(gt, oa->xe, i) { + ret = oa_init_gt(gt); + if (ret) + return ret; + } + + return 0; +} + +static void oa_init_supported_formats(struct xe_oa *oa) +{ + switch (oa->xe->info.platform) { + case XE_ALDERLAKE_S: + case XE_ALDERLAKE_P: + oa_format_add(oa, XE_OA_FORMAT_A12); + oa_format_add(oa, XE_OA_FORMAT_A12_B8_C8); + oa_format_add(oa, XE_OA_FORMAT_A32u40_A4u32_B8_C8); + oa_format_add(oa, XE_OA_FORMAT_C4_B8); + break; + + case XE_DG2: + oa_format_add(oa, XE_OAR_FORMAT_A32u40_A4u32_B8_C8); + oa_format_add(oa, XE_OA_FORMAT_A24u40_A14u32_B8_C8); + break; + + case XE_METEORLAKE: + oa_format_add(oa, XE_OAR_FORMAT_A32u40_A4u32_B8_C8); + oa_format_add(oa, XE_OA_FORMAT_A24u40_A14u32_B8_C8); + oa_format_add(oa, XE_OAM_FORMAT_MPEC8u64_B8_C8); + oa_format_add(oa, XE_OAM_FORMAT_MPEC8u32_B8_C8); + break; + + default: + drm_err(&oa->xe->drm, "Unknown platform\n"); + } +} + +static void xe_oa_init_info(struct xe_device *xe) +{ +} + +int xe_oa_init(struct xe_device *xe) +{ + struct xe_oa *oa = &xe->oa; + struct xe_gt *gt; + int i, ret; + + oa->xe = xe; + oa->oa_formats = oa_formats; + xe_oa_init_info(xe); + + for_each_gt(gt, xe, i) + mutex_init(>->oa.lock); 
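+	/* gt->oa.lock serializes OA stream open/close for this gt (see xe_oa_stream_open_ioctl() and xe_oa_release()) */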
+ + /* Choose a representative limit */ + oa_sample_rate_hard_limit = xe_root_mmio_gt(xe)->info.clock_freq / 2; + + mutex_init(&oa->metrics_lock); + idr_init_base(&oa->metrics_idr, 1); + + /* We set up some ratelimit state to potentially throttle any + * _NOTES about spurious, invalid OA reports which we don't + * forward to userspace. + * + * We print a _NOTE about any throttling when closing the + * stream instead of waiting until driver _fini which no one + * would ever see. + * + * Using the same limiting factors as printk_ratelimit() + */ + ratelimit_state_init(&oa->spurious_report_rs, 5 * HZ, 10); + /* Since we use a DRM_NOTE for spurious reports it would be + * inconsistent to let __ratelimit() automatically print a + * warning for throttling. + */ + ratelimit_set_flags(&oa->spurious_report_rs, RATELIMIT_MSG_ON_RELEASE); + ratelimit_state_init(&oa->tail_pointer_race, 5 * HZ, 10); + ratelimit_set_flags(&oa->tail_pointer_race,RATELIMIT_MSG_ON_RELEASE); + atomic64_set(&oa->noa_programming_delay, 500 * 1000 /* 500us */); + + ret = oa_init_engine_groups(oa); + if (ret) { + drm_err(&xe->drm, "OA initialization failed %d\n", ret); + return ret; + } + + oa_init_supported_formats(oa); + + oa->xe = xe; + return 0; +} + +static int destroy_config(int id, void *p, void *data) +{ + xe_oa_config_put(p); + return 0; +} + +int xe_oa_sysctl_register(void) +{ + sysctl_header = register_sysctl("dev/xe", oa_table); + return 0; +} + +void xe_oa_sysctl_unregister(void) +{ + unregister_sysctl_table(sysctl_header); +} + +void xe_oa_fini(struct xe_device *xe) +{ + struct xe_oa *oa = &xe->oa; + struct xe_gt *gt; + int i; + + if (!oa->xe) + return; + + for_each_gt(gt, xe, i) + kfree(gt->oa.group); + + idr_for_each(&oa->metrics_idr, destroy_config, oa); + idr_destroy(&oa->metrics_idr); + + oa->xe = NULL; +} + +int xe_oa_ioctl_version(struct xe_device *xe) +{ + return 7; // FIXME +} diff --git a/drivers/gpu/drm/xe/xe_oa.h b/drivers/gpu/drm/xe/xe_oa.h new file mode 100644 index 0000000000000..d13b75fa0256c --- /dev/null +++ b/drivers/gpu/drm/xe/xe_oa.h @@ -0,0 +1,402 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef _XE_OA_H_ +#define _XE_OA_H_ + +#include +#include +#include "regs/xe_reg_defs.h" + +struct drm_device; +struct drm_file; + +enum { + OA_GROUP_OAG = 0, + OA_GROUP_OAM_SAMEDIA_0 = 0, + + OA_GROUP_MAX, + OA_GROUP_INVALID = U32_MAX, +}; + +enum oa_type { + TYPE_OAG, + TYPE_OAM, +}; + +enum report_header { + HDR_32_BIT = 0, + HDR_64_BIT, +}; + +struct xe_oa_format { + u32 format; + int size; + int type; + enum report_header header; +}; + +struct xe_oa_reg { + struct xe_reg addr; + u32 value; +}; + +struct xe_oa_range { + u32 start; + u32 end; +}; + +struct xe_oa_config { + struct xe_oa *oa; + + char uuid[UUID_STRING_LEN + 1]; + int id; + + const struct xe_oa_reg *mux_regs; + u32 mux_regs_len; + const struct xe_oa_reg *b_counter_regs; + u32 b_counter_regs_len; + const struct xe_oa_reg *flex_regs; + u32 flex_regs_len; + + struct attribute_group sysfs_metric; + struct attribute *attrs[2]; + struct kobj_attribute sysfs_metric_id; + + struct kref ref; + struct rcu_head rcu; +}; + +struct xe_oa_regs { + u32 base; + struct xe_reg oa_head_ptr; + struct xe_reg oa_tail_ptr; + struct xe_reg oa_buffer; + struct xe_reg oa_ctx_ctrl; + struct xe_reg oa_ctrl; + struct xe_reg oa_debug; + struct xe_reg oa_status; + u32 oa_ctrl_counter_format_shift; +}; + +struct xe_oa_group { + /* + * @type: Identifier for the OA unit. 
+ */ + u32 oa_unit_id; + + /* + * @exclusive_stream: The stream currently using the OA unit. This is + * sometimes accessed outside a syscall associated to its file + * descriptor. + */ + struct xe_oa_stream *exclusive_stream; + + /* + * @num_engines: The number of engines using this OA unit. + */ + u32 num_engines; + + /* + * @regs: OA buffer register group for programming the OA unit. + */ + struct xe_oa_regs regs; + + /* + * @type: Type of OA unit - OAM, OAG etc. + */ + enum oa_type type; +}; + +struct xe_oa_gt { + /* + * Lock associated with anything below within this structure. + */ + struct mutex lock; + + /** FIXME + * @sseu: sseu configuration selected to run while perf is active, + * applies to all contexts. + */ + // struct intel_sseu sseu; + + /** + * @num_perf_groups: number of perf groups per gt. + */ + u32 num_perf_groups; + + /* + * @group: list of OA groups - one for each OA buffer. + */ + struct xe_oa_group *group; +}; + +struct xe_oa { + struct xe_device *xe; + + struct kobject *metrics_kobj; + + /* + * Lock associated with adding/modifying/removing OA configs + * in perf->metrics_idr. + */ + struct mutex metrics_lock; + + /* + * List of dynamic configurations (struct i915_oa_config), you + * need to hold perf->metrics_lock to access it. + */ + struct idr metrics_idr; + + /** + * For rate limiting any notifications of spurious + * invalid OA reports + */ + struct ratelimit_state spurious_report_rs; + + /** + * For rate limiting any notifications of tail pointer + * race. + */ + struct ratelimit_state tail_pointer_race; + + // u32 gen7_latched_oastatus1; // FIXME + u32 ctx_oactxctrl_offset; + u32 ctx_flexeu0_offset; + + u32 gen8_valid_ctx_bit; // FIXME: deleted + + // struct i915_oa_ops ops; // FIXME: these are deleted + + const struct xe_oa_format *oa_formats; + +#define FORMAT_MASK_SIZE DIV_ROUND_UP(XE_OA_FORMAT_MAX - 1, BITS_PER_LONG) + unsigned long format_mask[FORMAT_MASK_SIZE]; + + atomic64_t noa_programming_delay; + + /* oa unit ids */ + u32 oa_unit_ids; +}; + +/** + * struct xe_perf_stream - state for a single open stream FD + */ +struct xe_oa_stream { + /** + * @xef: xe_file associated with oa stream + * FIXME: is it ok to do this? Otherwise modify functions to pass this in where needed. + */ + struct xe_file *xef; + + /** + * @perf: xe_oa backpointer + */ + struct xe_oa *oa; + + /** + * @gt: gt + */ + struct xe_gt *gt; + + /** + * FIXME: struct xe_hw_engine instead of intel_engine_cs + * @hwe: hardware engine associated with this performance stream. + */ + struct xe_hw_engine *hwe; + + /** + * @lock: Lock associated with operations on stream + */ + struct mutex lock; + + /** + * @sample_flags: Flags representing the `DRM_I915_PERF_PROP_SAMPLE_*` + * properties given when opening a stream, representing the contents + * of a single sample as read() by userspace. + */ + u32 sample_flags; + + /** + * @sample_size: Considering the configured contents of a sample + * combined with the required header size, this is the total size + * of a single sample record. + */ + int sample_size; + + /** + * FIXME: struct xe_engine instead of i915_gem_context + * @engine: %NULL if measuring system-wide across all contexts or a + * specific context that is being monitored. + */ + struct xe_engine *engine; + + /** + * @enabled: Whether the stream is currently enabled, considering + * whether the stream was opened in a disabled state and based + * on `I915_PERF_IOCTL_ENABLE` and `I915_PERF_IOCTL_DISABLE` calls. 
+ */ + bool enabled; + + /** + * @hold_preemption: Whether preemption is put on hold for command + * submissions done on the @ctx. This is useful for some drivers that + * cannot easily post process the OA buffer context to subtract delta + * of performance counters not associated with @ctx. + */ + bool hold_preemption; + + /** + * @ops: The callbacks providing the implementation of this specific + * type of configured stream. + */ + // const struct xe_perf_stream_ops *ops; // FIXME: these are deleted + + /** + * @oa_config: The OA configuration used by the stream. + */ + struct xe_oa_config *oa_config; + + /** + * @oa_config_bos: A list of struct i915_oa_config_bo allocated lazily + * each time @oa_config changes. + */ + struct llist_head oa_config_bos; + + /** + * @pinned_ctx: The OA context specific information. + * FIXME: not needed for xe, should be 'struct xe_lrc *' if needed + */ + // struct intel_context *pinned_ctx; + + /** + * @specific_ctx_id: The id of the specific context. + */ + u32 specific_ctx_id; + + /** + * @specific_ctx_id_mask: The mask used to masking specific_ctx_id bits. + */ + u32 specific_ctx_id_mask; + + /** + * @poll_check_timer: High resolution timer that will periodically + * check for data in the circular OA buffer for notifying userspace + * (e.g. during a read() or poll()). + */ + struct hrtimer poll_check_timer; + + /** + * @poll_wq: The wait queue that hrtimer callback wakes when it + * sees data ready to read in the circular OA buffer. + */ + wait_queue_head_t poll_wq; + + /** + * @pollin: Whether there is data available to read. + */ + bool pollin; + + /** + * @periodic: Whether periodic sampling is currently enabled. + */ + bool periodic; + + /** + * @period_exponent: The OA unit sampling frequency is derived from this. + */ + int period_exponent; + + /** + * @oa_buffer: State of the OA buffer. + */ + struct { + const struct xe_oa_format *format; + struct xe_bo *bo; + // struct xe_vma *vma; + u8 *vaddr; + u32 last_ctx_id; + int size_exponent; + + /** + * @ptr_lock: Locks reads and writes to all head/tail state + * + * Consider: the head and tail pointer state needs to be read + * consistently from a hrtimer callback (atomic context) and + * read() fop (user context) with tail pointer updates happening + * in atomic context and head updates in user context and the + * (unlikely) possibility of read() errors needing to reset all + * head/tail state. + * + * Note: Contention/performance aren't currently a significant + * concern here considering the relatively low frequency of + * hrtimer callbacks (5ms period) and that reads typically only + * happen in response to a hrtimer event and likely complete + * before the next callback. + * + * Note: This lock is not held *while* reading and copying data + * to userspace so the value of head observed in htrimer + * callbacks won't represent any partial consumption of data. + */ + spinlock_t ptr_lock; + + /** + * @head: Although we can always read back the head pointer register, + * we prefer to avoid trusting the HW state, just to avoid any + * risk that some hardware condition could * somehow bump the + * head pointer unpredictably and cause us to forward the wrong + * OA buffer data to userspace. + */ + u32 head; + + /** + * @tail: The last verified tail that can be read by userspace. + */ + u32 tail; + } oa_buffer; + + /** + * @noa_wait: A batch buffer doing a wait on the GPU for the NOA + * logic to be reprogrammed. 
+ */ + struct xe_bo *noa_wait; + + /** + * @poll_oa_period: The period in nanoseconds at which the OA + * buffer should be checked for available data. + */ + u64 poll_oa_period; + + /** + * @override_gucrc: GuC RC has been overridden for the perf stream, + * and we need to restore the default configuration on release. + */ + bool override_gucrc; +}; + +/* Below __UNUSED__ refers to exported oa functions not called from other parts of xe */ +int xe_oa_init(struct xe_device *xe); +void xe_oa_fini(struct xe_device *xe); +void xe_oa_register(struct xe_device *xe); +void xe_oa_unregister(struct xe_device *xe); +int xe_oa_ioctl_version(struct xe_device *xe); +int xe_oa_sysctl_register(void); +void xe_oa_sysctl_unregister(void); + +int xe_oa_stream_open_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); +int xe_oa_add_config_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); +int xe_oa_remove_config_ioctl(struct drm_device *dev, void *data, + struct drm_file *file); +void xe_oa_init_reg_state(void); // __UNUSED__ + +struct xe_oa_config *xe_oa_get_oa_config(struct xe_oa *oa, int metrics_set); // __UNUSED__ +struct xe_oa_config *xe_oa_config_get(struct xe_oa_config *oa_config); // __UNUSED__ +void xe_oa_config_put(struct xe_oa_config *oa_config); // __UNUSED__ + +u32 xe_oa_timestamp_frequency(struct xe_device *xe); + +#endif diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c index 9acbb27dfcab9..895663dbf761b 100644 --- a/drivers/gpu/drm/xe/xe_query.c +++ b/drivers/gpu/drm/xe/xe_query.c @@ -77,7 +77,10 @@ static int query_engines(struct xe_device *xe, xe_to_user_engine_class[hwe->class]; hw_engine_info[i].engine_instance = hwe->logical_instance; - hw_engine_info[i++].gt_id = gt->info.id; + hw_engine_info[i].gt_id = gt->info.id; + hw_engine_info[i++].oa_unit_id = + hwe->oa_group && hwe->oa_group->num_engines ? 
+				hwe->oa_group->oa_unit_id : U16_MAX;
 		}
 
 	if (copy_to_user(query_ptr, hw_engine_info, size)) {
@@ -203,6 +206,7 @@ static int query_config(struct xe_device *xe, struct drm_xe_device_query *query)
 		hweight_long(xe->info.mem_region_mask);
 	config->info[XE_QUERY_CONFIG_MAX_ENGINE_PRIORITY] =
 		xe_engine_device_get_max_priority(xe);
+	config->info[XE_QUERY_OA_IOCTL_VERSION] = xe_oa_ioctl_version(xe);
 
 	if (copy_to_user(query_ptr, config, size)) {
 		kfree(config);
@@ -244,6 +248,7 @@ static int query_gts(struct xe_device *xe, struct drm_xe_device_query *query)
 		gts->gts[id].type = XE_QUERY_GT_TYPE_MAIN;
 		gts->gts[id].instance = id;
 		gts->gts[id].clock_freq = gt->info.clock_freq;
+		gts->gts[id].oa_timestamp_freq = xe_oa_timestamp_frequency(xe);
 		if (!IS_DGFX(xe))
 			gts->gts[id].native_mem_regions = 0x1;
 		else
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index e890b131af918..44219696cfff2 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -101,6 +101,9 @@ struct xe_user_extension {
 #define DRM_XE_WAIT_USER_FENCE		0x0b
 #define DRM_XE_VM_MADVISE		0x0c
 #define DRM_XE_ENGINE_GET_PROPERTY	0x0d
+#define DRM_XE_OA_OPEN			0x36
+#define DRM_XE_OA_ADD_CONFIG		0x37
+#define DRM_XE_OA_REMOVE_CONFIG		0x38
 /* Must be kept compact -- no holes */
 
 #define DRM_IOCTL_XE_DEVICE_QUERY	DRM_IOWR(DRM_COMMAND_BASE + DRM_XE_DEVICE_QUERY, struct drm_xe_device_query)
@@ -117,6 +120,9 @@ struct xe_user_extension {
 #define DRM_IOCTL_XE_ENGINE_SET_PROPERTY	DRM_IOW(DRM_COMMAND_BASE + DRM_XE_ENGINE_SET_PROPERTY, struct drm_xe_engine_set_property)
 #define DRM_IOCTL_XE_WAIT_USER_FENCE		DRM_IOWR(DRM_COMMAND_BASE + DRM_XE_WAIT_USER_FENCE, struct drm_xe_wait_user_fence)
 #define DRM_IOCTL_XE_VM_MADVISE			DRM_IOW(DRM_COMMAND_BASE + DRM_XE_VM_MADVISE, struct drm_xe_vm_madvise)
+#define DRM_IOCTL_XE_OA_OPEN			DRM_IOW(DRM_COMMAND_BASE + DRM_XE_OA_OPEN, struct drm_xe_oa_open_param)
+#define DRM_IOCTL_XE_OA_ADD_CONFIG		DRM_IOW(DRM_COMMAND_BASE + DRM_XE_OA_ADD_CONFIG, struct drm_xe_oa_config)
+#define DRM_IOCTL_XE_OA_REMOVE_CONFIG		DRM_IOW(DRM_COMMAND_BASE + DRM_XE_OA_REMOVE_CONFIG, __u64)
 
 /**
  * enum drm_xe_memory_class - Supported memory classes.
@@ -223,7 +229,8 @@ struct drm_xe_query_config {
 #define XE_QUERY_CONFIG_GT_COUNT		4
 #define XE_QUERY_CONFIG_MEM_REGION_COUNT	5
 #define XE_QUERY_CONFIG_MAX_ENGINE_PRIORITY	6
-#define XE_QUERY_CONFIG_NUM_PARAM	(XE_QUERY_CONFIG_MAX_ENGINE_PRIORITY + 1)
+#define XE_QUERY_OA_IOCTL_VERSION		7
+#define XE_QUERY_CONFIG_NUM_PARAM	(XE_QUERY_OA_IOCTL_VERSION + 1)
 	/** @info: array of elements containing the config info */
 	__u64 info[];
 };
@@ -260,6 +267,7 @@ struct drm_xe_query_gts {
 		__u64 native_mem_regions; /* bit mask of instances from drm_xe_query_mem_usage */
 		__u64 slow_mem_regions; /* bit mask of instances from drm_xe_query_mem_usage */
 		__u64 inaccessible_mem_regions; /* bit mask of instances from drm_xe_query_mem_usage */
+		__u64 oa_timestamp_freq;
 		__u64 reserved[8];
 	} gts[];
 };
@@ -699,6 +707,7 @@ struct drm_xe_engine_class_instance {
 	__u16 engine_instance;
 	__u16 gt_id;
+	__u16 oa_unit_id;
 };
 
 struct drm_xe_engine_create {
@@ -1002,6 +1011,280 @@ struct drm_xe_vm_madvise {
 	__u64 reserved[2];
 };
 
+enum drm_xe_oa_format {
+	XE_OA_FORMAT_C4_B8 = 7,
+
+	/* Gen8+ */
+	XE_OA_FORMAT_A12,
+	XE_OA_FORMAT_A12_B8_C8,
+	XE_OA_FORMAT_A32u40_A4u32_B8_C8,
+
+	/* DG2 */
+	XE_OAR_FORMAT_A32u40_A4u32_B8_C8,
+	XE_OA_FORMAT_A24u40_A14u32_B8_C8,
+
+	/* MTL OAM */
+	XE_OAM_FORMAT_MPEC8u64_B8_C8,
+	XE_OAM_FORMAT_MPEC8u32_B8_C8,
+
+	XE_OA_FORMAT_MAX /* non-ABI */
+};
+
+enum drm_xe_oa_property_id {
+	/**
+	 * Open the stream for a specific context handle (as used with
+	 * execbuffer2). A stream opened for a specific context this way
+	 * won't typically require root privileges.
+	 *
+	 * This property is available in perf revision 1.
+	 */
+	DRM_XE_OA_PROP_CTX_HANDLE = 1,
+
+	/**
+	 * A value of 1 requests the inclusion of raw OA unit reports as
+	 * part of stream samples.
+	 *
+	 * This property is available in perf revision 1.
+	 */
+	DRM_XE_OA_PROP_SAMPLE_OA,
+
+	/**
+	 * The value specifies which set of OA unit metrics should be
+	 * configured, defining the contents of any OA unit reports.
+	 *
+	 * This property is available in perf revision 1.
+	 */
+	DRM_XE_OA_PROP_OA_METRICS_SET,
+
+	/**
+	 * The value specifies the size and layout of OA unit reports.
+	 *
+	 * This property is available in perf revision 1.
+	 */
+	DRM_XE_OA_PROP_OA_FORMAT,
+
+	/**
+	 * Specifying this property implicitly requests periodic OA unit
+	 * sampling and (at least on Haswell) the sampling frequency is derived
+	 * from this exponent as follows:
+	 *
+	 *   80ns * 2^(period_exponent + 1)
+	 *
+	 * This property is available in perf revision 1.
+	 */
+	DRM_XE_OA_PROP_OA_EXPONENT,
+
+	/**
+	 * Specifying this property is only valid when specifying a context to
+	 * filter with DRM_XE_OA_PROP_CTX_HANDLE. Specifying this property
+	 * will hold preemption of the particular context we want to gather
+	 * performance data about. The execbuf2 submissions must include a
+	 * drm_xe_gem_execbuffer_ext_perf parameter for this to apply.
+	 *
+	 * This property is available in perf revision 3.
+	 */
+	DRM_XE_OA_PROP_HOLD_PREEMPTION,
+
+	/**
+	 * Specifying this pins all contexts to the specified SSEU power
+	 * configuration for the duration of the recording.
+	 *
+	 * This parameter's value is a pointer to a struct
+	 * drm_xe_gem_context_param_sseu.
+	 *
+	 * This property is available in perf revision 4.
+	 */
+	DRM_XE_OA_PROP_GLOBAL_SSEU,
+
+	/**
+	 * This optional parameter specifies the timer interval in nanoseconds
+	 * at which the xe driver will check the OA buffer for available data.
+	 * Minimum allowed value is 100 microseconds. A default value is used by
+	 * the driver if this parameter is not specified. Note that larger timer
+	 * values will reduce CPU consumption during OA perf captures. However,
+	 * excessively large values would potentially result in OA buffer
+	 * overwrites as captures reach the end of the OA buffer.
+	 *
+	 * This property is available in perf revision 5.
+	 */
+	DRM_XE_OA_PROP_POLL_OA_PERIOD,
+
+	/**
+	 * Multiple engines may be mapped to the same OA unit. The OA unit is
+	 * identified by class:instance of any engine mapped to it.
+	 *
+	 * This parameter specifies the engine class and must be passed along
+	 * with DRM_XE_OA_PROP_OA_ENGINE_INSTANCE.
+	 *
+	 * This property is available in perf revision 6.
+	 */
+	DRM_XE_OA_PROP_OA_ENGINE_CLASS,
+
+	/**
+	 * This parameter specifies the engine instance and must be passed along
+	 * with DRM_XE_OA_PROP_OA_ENGINE_CLASS.
+	 *
+	 * This property is available in perf revision 6.
+	 */
+	DRM_XE_OA_PROP_OA_ENGINE_INSTANCE,
+
+	DRM_XE_OA_PROP_MAX /* non-ABI */
+};
+
+struct drm_xe_oa_open_param {
+	__u32 flags;
+#define XE_OA_FLAG_FD_CLOEXEC	(1 << 0)
+#define XE_OA_FLAG_FD_NONBLOCK	(1 << 1)
+#define XE_OA_FLAG_DISABLED	(1 << 2)
+
+	/** The number of u64 (id, value) pairs */
+	__u32 num_properties;
+
+	/**
+	 * Pointer to array of u64 (id, value) pairs configuring the stream
+	 * to open.
+	 */
+	__u64 properties_ptr;
+};
+
+/*
+ * Enable data capture for a stream that was either opened in a disabled state
+ * via XE_OA_FLAG_DISABLED or was later disabled via XE_OA_IOCTL_DISABLE.
+ *
+ * It is intended to be cheaper to disable and enable a stream than it may be
+ * to close and re-open a stream with the same configuration.
+ *
+ * It's undefined whether any pending data for the stream will be lost.
+ *
+ * This ioctl is available in perf revision 1.
+ */
+#define XE_OA_IOCTL_ENABLE	_IO('i', 0x0)
+
+/*
+ * Disable data capture for a stream.
+ *
+ * It is an error to try and read a stream that is disabled.
+ *
+ * This ioctl is available in perf revision 1.
+ */
+#define XE_OA_IOCTL_DISABLE	_IO('i', 0x1)
+
+/*
+ * Change metrics_set captured by a stream.
+ *
+ * If the stream is bound to a specific context, the configuration change
+ * will be performed inline with that context such that it takes effect before
+ * the next execbuf submission.
+ *
+ * Returns the previously bound metrics set id, or a negative error code.
+ *
+ * This ioctl is available in perf revision 2.
+ */
+#define XE_OA_IOCTL_CONFIG	_IO('i', 0x2)
+
+struct drm_xe_oa_record_header {
+	__u32 type;
+	__u16 pad;
+	__u16 size;
+};
+
+enum drm_xe_oa_record_type {
+
+	/**
+	 * Samples are the workhorse record type whose contents are extensible
+	 * and defined when opening an xe OA stream based on the given
+	 * properties.
+	 *
+	 * Boolean properties following the naming convention
+	 * DRM_XE_OA_PROP_SAMPLE_xyz request the inclusion of 'xyz' data in
+	 * every sample.
+	 *
+	 * The order of these sample properties given by userspace has no
+	 * effect on the ordering of data within a sample. The order is
+	 * documented here.
+	 *
+	 *     struct {
+	 *         struct drm_xe_oa_record_header header;
+	 *
+	 *         { u32 oa_report[]; } && DRM_XE_OA_PROP_SAMPLE_OA
+	 *     };
+	 */
+	DRM_XE_OA_RECORD_SAMPLE = 1,
+
+	/*
+	 * Indicates that one or more OA reports were not written by the
+	 * hardware. This can happen for example if an MI_REPORT_PERF_COUNT
+	 * command collides with periodic sampling - which would be more likely
+	 * at higher sampling frequencies.
+	 */
+	DRM_XE_OA_RECORD_OA_REPORT_LOST = 2,
+
+	/**
+	 * An error occurred that resulted in all pending OA reports being lost.
+	 */
+	DRM_XE_OA_RECORD_OA_BUFFER_LOST = 3,
+
+	DRM_XE_OA_RECORD_MAX /* non-ABI */
+};
+
+struct drm_xe_oa_config {
+	/**
+	 * @uuid:
+	 *
+	 * String formatted like "%\08x-%\04x-%\04x-%\04x-%\012x"
+	 */
+	char uuid[36];
+
+	/**
+	 * @n_mux_regs:
+	 *
+	 * Number of mux regs in &mux_regs_ptr.
+	 */
+	__u32 n_mux_regs;
+
+	/**
+	 * @n_boolean_regs:
+	 *
+	 * Number of boolean regs in &boolean_regs_ptr.
+	 */
+	__u32 n_boolean_regs;
+
+	/**
+	 * @n_flex_regs:
+	 *
+	 * Number of flex regs in &flex_regs_ptr.
+	 */
+	__u32 n_flex_regs;
+
+	/**
+	 * @mux_regs_ptr:
+	 *
+	 * Pointer to tuples of u32 values (register address, value) for mux
+	 * registers. Expected length of buffer is (2 * sizeof(u32) *
+	 * &n_mux_regs).
+	 */
+	__u64 mux_regs_ptr;
+
+	/**
+	 * @boolean_regs_ptr:
+	 *
+	 * Pointer to tuples of u32 values (register address, value) for boolean
+	 * registers. Expected length of buffer is (2 * sizeof(u32) *
+	 * &n_boolean_regs).
+	 */
+	__u64 boolean_regs_ptr;
+
+	/**
+	 * @flex_regs_ptr:
+	 *
+	 * Pointer to tuples of u32 values (register address, value) for flex
+	 * registers. Expected length of buffer is (2 * sizeof(u32) *
+	 * &n_flex_regs).
+	 */
+	__u64 flex_regs_ptr;
+};
+
 #if defined(__cplusplus)
 }
 #endif
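
For reviewers, here is a rough userspace sketch of how this uapi is meant to be driven end to end. It assumes the i915-perf-style model in which DRM_IOCTL_XE_OA_OPEN returns a new stream fd on success; the DRM node path, the metric set id and the sampling exponent below are illustrative placeholders rather than values defined by this patch, and details may still change as the series is cleaned up.

/*
 * Hedged usage sketch only: assumes DRM_IOCTL_XE_OA_OPEN returns a stream fd
 * (as DRM_IOCTL_I915_PERF_OPEN does), that record sizes include the header,
 * and that <drm/xe_drm.h> is the installed copy of this uapi header.
 * /dev/dri/card0, metric set id 1 and exponent 16 are placeholders.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <drm/xe_drm.h>

int main(void)
{
	uint64_t props[] = {
		DRM_XE_OA_PROP_SAMPLE_OA, 1,      /* include raw OA reports in samples */
		DRM_XE_OA_PROP_OA_METRICS_SET, 1, /* placeholder metric set id */
		DRM_XE_OA_PROP_OA_FORMAT, XE_OA_FORMAT_A32u40_A4u32_B8_C8,
		DRM_XE_OA_PROP_OA_EXPONENT, 16,   /* placeholder sampling exponent */
	};
	struct drm_xe_oa_open_param param = {
		.flags = XE_OA_FLAG_FD_CLOEXEC | XE_OA_FLAG_DISABLED,
		.num_properties = sizeof(props) / (2 * sizeof(uint64_t)),
		.properties_ptr = (uintptr_t)props,
	};
	uint8_t buf[4096];
	int card, stream;
	ssize_t len;

	card = open("/dev/dri/card0", O_RDWR);
	if (card < 0)
		return 1;

	/* Open the OA stream; on success the return value is the stream fd */
	stream = ioctl(card, DRM_IOCTL_XE_OA_OPEN, &param);
	if (stream < 0)
		return 1;

	/* The stream was opened disabled; start capture explicitly */
	ioctl(stream, XE_OA_IOCTL_ENABLE);

	/* read() returns whole records, each preceded by a record header */
	len = read(stream, buf, sizeof(buf));
	if (len > 0) {
		size_t off = 0;

		while (off + sizeof(struct drm_xe_oa_record_header) <= (size_t)len) {
			struct drm_xe_oa_record_header hdr;

			memcpy(&hdr, buf + off, sizeof(hdr));
			if (!hdr.size)
				break;
			if (hdr.type == DRM_XE_OA_RECORD_SAMPLE)
				printf("OA sample record, %u bytes\n",
				       (unsigned int)hdr.size);
			/* hdr.size is assumed to cover the full record, header included */
			off += hdr.size;
		}
	}

	close(stream);
	close(card);
	return 0;
}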