[07/16] drm/i915: flush periodic samples, in case of no pending CS sample requests

Submitted by sourab.gupta@intel.com on April 22, 2016, 11:33 a.m.

Details

Message ID 1461324845-25755-8-git-send-email-sourab.gupta@intel.com
State New
Headers show
Series "Framework to collect command stream gpu metrics using i915 perf" ( rev: 1 ) in Intel GFX

Not browsing as part of any series.

Commit Message

sourab.gupta@intel.com April 22, 2016, 11:33 a.m.
From: Sourab Gupta <sourab.gupta@intel.com>

When there are no pending CS OA samples, flush the periodic OA samples
collected so far.

We can safely forward the periodic OA samples in the case we
have no pending CS samples, but we can't do so in the case we have
pending CS samples, since we don't know what the ordering between
pending CS samples and periodic samples will eventually be. If we
have no pending CS sample, it won't be possible for a future CS sample
to have a timestamp earlier than the current periodic timestamp.

Signed-off-by: Sourab Gupta <sourab.gupta@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h  |  14 ++--
 drivers/gpu/drm/i915/i915_perf.c | 173 +++++++++++++++++++++++++++++----------
 2 files changed, 140 insertions(+), 47 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 050df37..bc4fc1b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1760,7 +1760,7 @@  struct i915_perf_stream {
 	/* Return: true if any i915 perf records are ready to read()
 	 * for this stream.
 	 */
-	bool (*can_read)(struct i915_perf_stream *stream);
+	bool (*can_read_unlocked)(struct i915_perf_stream *stream);
 
 	/* Call poll_wait, passing a wait queue that will be woken
 	 * once there is something ready to read() for the stream
@@ -1772,8 +1772,8 @@  struct i915_perf_stream {
 	/* For handling a blocking read, wait until there is something
 	 * to ready to read() for the stream. E.g. wait on the same
 	 * wait queue that would be passed to poll_wait() until
-	 * ->can_read() returns true (if its safe to call ->can_read()
-	 * without the i915 perf lock held).
+	 * ->can_read_unlocked() returns true (if it's safe to call
+	 * ->can_read_unlocked() without the i915 perf lock held).
 	 */
 	int (*wait_unlocked)(struct i915_perf_stream *stream);
 
@@ -1819,8 +1819,10 @@  struct i915_oa_ops {
 					u32 ctx_id);
 	void (*legacy_ctx_switch_unlocked)(struct drm_i915_gem_request *req);
 	int (*read)(struct i915_perf_stream *stream,
-		    struct i915_perf_read_state *read_state, u32 ts);
-	bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv);
+		    struct i915_perf_read_state *read_state,
+		    u32 ts, u32 max_records);
+	int (*oa_buffer_num_samples)(struct drm_i915_private *dev_priv,
+					u32 *last_ts);
 };
 
 /*
@@ -2155,6 +2157,8 @@  struct drm_i915_private {
 			u32 gen7_latched_oastatus1;
 			u32 ctx_oactxctrl_off;
 			u32 ctx_flexeu0_off;
+			u32 n_pending_periodic_samples;
+			u32 pending_periodic_ts;
 
 			struct i915_oa_ops ops;
 			const struct i915_oa_format *oa_formats;
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 4adbf26..222de00 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -380,13 +380,30 @@  static void i915_oa_rcs_free_requests(struct drm_i915_private *dev_priv)
  * pointers.  A race here could result in a false positive !empty status which
  * is acceptable.
  */
-static bool gen8_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_priv)
+static int
+gen8_oa_buffer_num_samples_fop_unlocked(struct drm_i915_private *dev_priv,
+					u32 *last_ts)
 {
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
-	u32 head = I915_READ(GEN8_OAHEADPTR);
-	u32 tail = I915_READ(GEN8_OATAILPTR);
+	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.addr;
+	u32 head = I915_READ(GEN8_OAHEADPTR) & GEN8_OAHEADPTR_MASK;
+	u32 tail = I915_READ(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
+	u32 mask = (OA_BUFFER_SIZE - 1);
+	u32 num_samples;
+	u8 *report;
+
+	head -= dev_priv->perf.oa.oa_buffer.gtt_offset;
+	tail -= dev_priv->perf.oa.oa_buffer.gtt_offset;
+	num_samples = OA_TAKEN(tail, head) / report_size;
 
-	return OA_TAKEN(tail, head) < report_size;
+	/* read the timestamp of the last sample */
+	if (num_samples) {
+		head += report_size*(num_samples - 1);
+		report = oa_buf_base + (head & mask);
+		*last_ts = *(u32 *)(report + 4);
+	}
+
+	return num_samples;
 }
 
 /* NB: This is either called via fops or the poll check hrtimer (atomic ctx)
@@ -400,15 +417,32 @@  static bool gen8_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_pr
  * pointers.  A race here could result in a false positive !empty status which
  * is acceptable.
  */
-static bool gen7_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_priv)
+static int
+gen7_oa_buffer_num_samples_fop_unlocked(struct drm_i915_private *dev_priv,
+					u32 *last_ts)
 {
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
 	u32 oastatus2 = I915_READ(GEN7_OASTATUS2);
 	u32 oastatus1 = I915_READ(GEN7_OASTATUS1);
 	u32 head = oastatus2 & GEN7_OASTATUS2_HEAD_MASK;
 	u32 tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
+	u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.addr;
+	u32 mask = (OA_BUFFER_SIZE - 1);
+	int available_size;
+	u32 num_samples = 0;
+	u8 *report;
 
-	return OA_TAKEN(tail, head) < dev_priv->perf.oa.tail_margin + report_size;
+	head -= dev_priv->perf.oa.oa_buffer.gtt_offset;
+	tail -= dev_priv->perf.oa.oa_buffer.gtt_offset;
+	available_size = OA_TAKEN(tail, head) - dev_priv->perf.oa.tail_margin;
+	if (available_size >= report_size) {
+		num_samples = available_size / report_size;
+		head += report_size*(num_samples - 1);
+		report = oa_buf_base + (head & mask);
+		*last_ts = *(u32 *)(report + 4);
+	}
+
+	return num_samples;
 }
 
 /**
@@ -530,7 +564,7 @@  static int append_oa_buffer_sample(struct i915_perf_stream *stream,
 static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 				  struct i915_perf_read_state *read_state,
 				  u32 *head_ptr,
-				  u32 tail, u32 ts)
+				  u32 tail, u32 ts, u32 max_records)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -539,6 +573,7 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	u32 head;
 	u32 taken;
 	int ret = 0;
+	int n_records = 0;
 
 	head = *head_ptr - dev_priv->perf.oa.oa_buffer.gtt_offset;
 	tail -= dev_priv->perf.oa.oa_buffer.gtt_offset;
@@ -548,7 +583,7 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 	 * use the above mask.
 	 */
 
-	while ((taken = OA_TAKEN(tail, head))) {
+	while ((taken = OA_TAKEN(tail, head)) && (n_records <= max_records)) {
 		u8 *report;
 		u32 report_ts;
 
@@ -615,6 +650,7 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 				if (ret)
 					break;
 
+				n_records++;
 				dev_priv->perf.oa.oa_buffer.last_ctx_id = ctx_id;
 			}
 		}
@@ -629,7 +665,8 @@  static int gen8_append_oa_reports(struct i915_perf_stream *stream,
 }
 
 static int gen8_oa_read(struct i915_perf_stream *stream,
-			struct i915_perf_read_state *read_state, u32 ts)
+			struct i915_perf_read_state *read_state,
+			u32 ts, u32 max_records)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -671,7 +708,8 @@  static int gen8_oa_read(struct i915_perf_stream *stream,
 
 	/* If there is still buffer space */
 
-	ret = gen8_append_oa_reports(stream, read_state, &head, tail, ts);
+	ret = gen8_append_oa_reports(stream, read_state, &head, tail,
+				     ts, max_records);
 
 	/* All the report sizes are a power of two and the
 	 * head should always be incremented by some multiple
@@ -702,7 +740,7 @@  static int gen8_oa_read(struct i915_perf_stream *stream,
 static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 				  struct i915_perf_read_state *read_state,
 				  u32 *head_ptr,
-				  u32 tail, u32 ts)
+				  u32 tail, u32 ts, u32 max_records)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -712,6 +750,7 @@  static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 	u32 head;
 	u32 taken;
 	int ret = 0;
+	int n_records = 0;
 
 	BUG_ON(!stream->enabled);
 
@@ -749,7 +788,7 @@  static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 	tail &= mask;
 
 	for (/* none */;
-	     (taken = OA_TAKEN(tail, head));
+	     (taken = OA_TAKEN(tail, head)) && (n_records <= max_records);
 	     head = (head + report_size) & mask)
 	{
 		u8 *report = oa_buf_base + head;
@@ -783,6 +822,7 @@  static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 		ret = append_oa_buffer_sample(stream, read_state, report);
 		if (ret)
 			break;
+		n_records++;
 
 		/* The above report-id field sanity check is based on
 		 * the assumption that the OA buffer is initially
@@ -799,7 +839,8 @@  static int gen7_append_oa_reports(struct i915_perf_stream *stream,
 }
 
 static int gen7_oa_read(struct i915_perf_stream *stream,
-			struct i915_perf_read_state *read_state, u32 ts)
+			struct i915_perf_read_state *read_state,
+			u32 ts, u32 max_records)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	int report_size = dev_priv->perf.oa.oa_buffer.format_size;
@@ -871,7 +912,8 @@  static int gen7_oa_read(struct i915_perf_stream *stream,
 			GEN7_OASTATUS1_REPORT_LOST;
 	}
 
-	ret = gen7_append_oa_reports(stream, read_state, &head, tail, ts);
+	ret = gen7_append_oa_reports(stream, read_state, &head, tail,
+				ts, max_records);
 
 	/* All the report sizes are a power of two and the
 	 * head should always be incremented by some multiple
@@ -917,7 +959,8 @@  static int append_oa_rcs_sample(struct i915_perf_stream *stream,
 
 	/* First, append the periodic OA samples having lower timestamps */
 	report_ts = *(u32 *)(report + 4);
-	ret = dev_priv->perf.oa.ops.read(stream, read_state, report_ts);
+	ret = dev_priv->perf.oa.ops.read(stream, read_state,
+					report_ts, U32_MAX);
 	if (ret)
 		return ret;
 
@@ -934,7 +977,8 @@  static int append_oa_rcs_sample(struct i915_perf_stream *stream,
 }
 
 /**
- * Copies all command stream based OA reports into userspace read() buffer.
+ * Copies all OA reports into userspace read() buffer. This includes command
+ * stream as well as periodic OA reports.
  *
  * NB: some data may be successfully copied to the userspace buffer
  * even if an error is returned, and this is reflected in the
@@ -951,7 +995,7 @@  static int oa_rcs_append_reports(struct i915_perf_stream *stream,
 	spin_lock(&dev_priv->perf.node_list_lock);
 	if (list_empty(&dev_priv->perf.node_list)) {
 		spin_unlock(&dev_priv->perf.node_list_lock);
-		return 0;
+		goto pending_periodic;
 	}
 	list_for_each_entry_safe(entry, next,
 				 &dev_priv->perf.node_list, link) {
@@ -962,7 +1006,7 @@  static int oa_rcs_append_reports(struct i915_perf_stream *stream,
 	spin_unlock(&dev_priv->perf.node_list_lock);
 
 	if (list_empty(&free_list))
-		return 0;
+		goto pending_periodic;
 
 	list_for_each_entry_safe(entry, next, &free_list, link) {
 		ret = append_oa_rcs_sample(stream, read_state, entry);
@@ -980,16 +1024,35 @@  static int oa_rcs_append_reports(struct i915_perf_stream *stream,
 	spin_unlock(&dev_priv->perf.node_list_lock);
 
 	return ret;
+
+pending_periodic:
+	if (!dev_priv->perf.oa.n_pending_periodic_samples)
+		return 0;
+
+	ret = dev_priv->perf.oa.ops.read(stream, read_state,
+				dev_priv->perf.oa.pending_periodic_ts,
+				dev_priv->perf.oa.n_pending_periodic_samples);
+	dev_priv->perf.oa.n_pending_periodic_samples = 0;
+	dev_priv->perf.oa.pending_periodic_ts = 0;
+	return ret;
 }
 
+enum cs_buf_data_state {
+	CS_BUF_EMPTY,
+	CS_BUF_REQ_PENDING,
+	CS_BUF_HAVE_DATA,
+};
+
 /*
  * Checks whether the command stream buffer associated with the stream has
  * data ready to be forwarded to userspace.
- * Returns true if atleast one request associated with command stream is
- * completed, else returns false.
+ * Value returned:
+ * CS_BUF_HAVE_DATA	- if there is at least one completed request
+ * CS_BUF_REQ_PENDING	- there are requests pending, but no completed requests
+ * CS_BUF_EMPTY		- no requests scheduled
  */
-static bool command_stream_buf_is_empty(struct i915_perf_stream *stream)
-
+static enum cs_buf_data_state command_stream_buf_state(
+				struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
 	struct i915_perf_cs_data_node *entry = NULL;
@@ -1003,37 +1066,62 @@  static bool command_stream_buf_is_empty(struct i915_perf_stream *stream)
 	spin_unlock(&dev_priv->perf.node_list_lock);
 
 	if (!entry)
-		return true;
+		return CS_BUF_EMPTY;
 	else if (!i915_gem_request_completed(request, true))
-		return true;
+		return CS_BUF_REQ_PENDING;
 	else
-		return false;
+		return CS_BUF_HAVE_DATA;
 }
 
 /*
- * Checks whether the stream has data ready to forward to userspace.
- * For command stream based streams, check if the command stream buffer has
- * atleast one sample ready, if not return false, irrespective of periodic
- * oa buffer having the data or not.
+ * Checks whether the stream has data ready to forward to userspace, by
+ * querying for periodic oa buffer and command stream buffer samples.
  */
-
 static bool stream_have_data__unlocked(struct i915_perf_stream *stream)
 {
 	struct drm_i915_private *dev_priv = stream->dev_priv;
+	enum cs_buf_data_state cs_buf_state;
+	u32 num_samples, last_ts = 0;
 
-	/* Note: the oa_buffer_is_empty() condition is ok to run unlocked as it
-	 * just performs mmio reads of the OA buffer head + tail pointers and
+	/* Note: oa_buffer_num_samples() is ok to run unlocked as it just
+	 * performs mmio reads of the OA buffer head + tail pointers and
 	 * it's assumed we're handling some operation that implies the stream
 	 * can't be destroyed until completion (such as a read()) that ensures
 	 * the device + OA buffer can't disappear
 	 */
+	dev_priv->perf.oa.n_pending_periodic_samples = 0;
+	dev_priv->perf.oa.pending_periodic_ts = 0;
+	num_samples = dev_priv->perf.oa.ops.oa_buffer_num_samples(dev_priv,
+								&last_ts);
 	if (stream->cs_mode)
-		return !command_stream_buf_is_empty(stream);
+		cs_buf_state = command_stream_buf_state(stream);
 	else
-		return !dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv);
+		cs_buf_state = CS_BUF_EMPTY;
+
+	/*
+	 * Note: We can safely forward the periodic OA samples in the case we
+	 * have no pending CS samples, but we can't do so in the case we have
+	 * pending CS samples, since we don't know what the ordering between
+	 * pending CS samples and periodic samples will eventually be. If we
+	 * have no pending CS sample, it won't be possible for a future CS
+	 * sample to have a timestamp earlier than the current periodic timestamp.
+	 */
+	switch (cs_buf_state) {
+	case CS_BUF_EMPTY:
+		dev_priv->perf.oa.n_pending_periodic_samples = num_samples;
+		dev_priv->perf.oa.pending_periodic_ts = last_ts;
+		return (num_samples != 0);
+
+	case CS_BUF_HAVE_DATA:
+		return true;
+
+	case CS_BUF_REQ_PENDING:
+	default:
+		return false;
+	}
 }
 
-static bool i915_oa_can_read(struct i915_perf_stream *stream)
+static bool i915_oa_can_read_unlocked(struct i915_perf_stream *stream)
 {
 
 	return stream_have_data__unlocked(stream);
@@ -1071,7 +1159,8 @@  static int i915_oa_read(struct i915_perf_stream *stream,
 	if (stream->cs_mode)
 		return oa_rcs_append_reports(stream, read_state);
 	else
-		return dev_priv->perf.oa.ops.read(stream, read_state, U32_MAX);
+		return dev_priv->perf.oa.ops.read(stream, read_state,
+						U32_MAX, U32_MAX);
 }
 
 static void
@@ -1811,7 +1900,7 @@  static int i915_oa_stream_init(struct i915_perf_stream *stream,
 	stream->destroy = i915_oa_stream_destroy;
 	stream->enable = i915_oa_stream_enable;
 	stream->disable = i915_oa_stream_disable;
-	stream->can_read = i915_oa_can_read;
+	stream->can_read_unlocked = i915_oa_can_read_unlocked;
 	stream->wait_unlocked = i915_oa_wait_unlocked;
 	stream->poll_wait = i915_oa_poll_wait;
 	stream->read = i915_oa_read;
@@ -2101,7 +2190,7 @@  static unsigned int i915_perf_poll_locked(struct i915_perf_stream *stream,
 
 	stream->poll_wait(stream, file, wait);
 
-	if (stream->can_read(stream))
+	if (stream->can_read_unlocked(stream))
 		streams |= POLLIN;
 
 	return streams;
@@ -2582,8 +2671,8 @@  void i915_perf_init(struct drm_device *dev)
 		dev_priv->perf.oa.ops.update_hw_ctx_id_locked =
 						gen7_update_hw_ctx_id_locked;
 		dev_priv->perf.oa.ops.read = gen7_oa_read;
-		dev_priv->perf.oa.ops.oa_buffer_is_empty =
-			gen7_oa_buffer_is_empty_fop_unlocked;
+		dev_priv->perf.oa.ops.oa_buffer_num_samples =
+			gen7_oa_buffer_num_samples_fop_unlocked;
 
 		dev_priv->perf.oa.oa_formats = hsw_oa_formats;
 
@@ -2597,8 +2686,8 @@  void i915_perf_init(struct drm_device *dev)
 		dev_priv->perf.oa.ops.oa_enable = gen8_oa_enable;
 		dev_priv->perf.oa.ops.oa_disable = gen8_oa_disable;
 		dev_priv->perf.oa.ops.read = gen8_oa_read;
-		dev_priv->perf.oa.ops.oa_buffer_is_empty =
-			gen8_oa_buffer_is_empty_fop_unlocked;
+		dev_priv->perf.oa.ops.oa_buffer_num_samples =
+				gen8_oa_buffer_num_samples_fop_unlocked;
 
 		dev_priv->perf.oa.oa_formats = gen8_plus_oa_formats;