[v2,3/3] drm/vc4: Add a load tracker to prevent HVS underflow errors

Submitted by Boris Brezillon on Oct. 25, 2018, 12:45 p.m.

Details

Message ID 20181025124546.22145-4-boris.brezillon@bootlin.com
State New
Headers show
Series "drm/vc4: Add a load tracker" ( rev: 1 ) in DRI devel

Browsing this patch as part of:
"drm/vc4: Add a load tracker" rev 1 in DRI devel
<< prev patch [3/3] next patch >>

Commit Message

Boris Brezillon Oct. 25, 2018, 12:45 p.m.
The HVS block is supposed to fill the pixelvalve FIFOs fast enough to
meet the requested framerate. The problem is, the HVS and memory bus
bandwidths are limited, and if we don't take these limitations into
account we might end up with HVS underflow errors.

This patch is trying to model the per-plane HVS and memory bus bandwidth
consumption and take a decision at atomic_check() time whether the
estimated load will fit in the HVS and membus budget.

Note that we take an extra margin on the memory bus consumption to let
the system run smoothly when other blocks are doing heavy use of the
memory bus. Same goes for the HVS limit, except the margin is smaller in
this case, since the HVS is not used by external components.

Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
Changes in v2:
- Remove an unused var in vc4_load_tracker_atomic_check()
---
 drivers/gpu/drm/vc4/vc4_drv.c   |   1 +
 drivers/gpu/drm/vc4/vc4_drv.h   |  11 ++++
 drivers/gpu/drm/vc4/vc4_kms.c   | 103 +++++++++++++++++++++++++++++++-
 drivers/gpu/drm/vc4/vc4_plane.c |  60 +++++++++++++++++++
 4 files changed, 174 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index f6f5cd80c04d..7195a0bcceb3 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -313,6 +313,7 @@  static void vc4_drm_unbind(struct device *dev)
 
 	drm_mode_config_cleanup(drm);
 
+	drm_atomic_private_obj_fini(&vc4->load_tracker);
 	drm_atomic_private_obj_fini(&vc4->ctm_manager);
 
 	drm_dev_put(drm);
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index a6c67c65ffbc..11369da944b6 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -200,6 +200,7 @@  struct vc4_dev {
 
 	struct drm_modeset_lock ctm_state_lock;
 	struct drm_private_obj ctm_manager;
+	struct drm_private_obj load_tracker;
 };
 
 static inline struct vc4_dev *
@@ -369,6 +370,16 @@  struct vc4_plane_state {
 	 * to enable background color fill.
 	 */
 	bool needs_bg_fill;
+
+	/* Load of this plane on the HVS block. The load is expressed in HVS
+	 * cycles/sec.
+	 */
+	u64 hvs_load;
+
+	/* Memory bandwidth needed for this plane. This is expressed in
+	 * bytes/sec.
+	 */
+	u64 membus_load;
 };
 
 static inline struct vc4_plane_state *
diff --git a/drivers/gpu/drm/vc4/vc4_kms.c b/drivers/gpu/drm/vc4/vc4_kms.c
index 8e0183b1e8bb..b905a19c1470 100644
--- a/drivers/gpu/drm/vc4/vc4_kms.c
+++ b/drivers/gpu/drm/vc4/vc4_kms.c
@@ -34,6 +34,18 @@  static struct vc4_ctm_state *to_vc4_ctm_state(struct drm_private_state *priv)
 	return container_of(priv, struct vc4_ctm_state, base);
 }
 
+struct vc4_load_tracker_state {
+	struct drm_private_state base;
+	u64 hvs_load;
+	u64 membus_load;
+};
+
+static struct vc4_load_tracker_state *
+to_vc4_load_tracker_state(struct drm_private_state *priv)
+{
+	return container_of(priv, struct vc4_load_tracker_state, base);
+}
+
 static struct vc4_ctm_state *vc4_get_ctm_state(struct drm_atomic_state *state,
 					       struct drm_private_obj *manager)
 {
@@ -383,6 +395,81 @@  vc4_ctm_atomic_check(struct drm_device *dev, struct drm_atomic_state *state)
 	return 0;
 }
 
+static int vc4_load_tracker_atomic_check(struct drm_atomic_state *state)
+{
+	struct drm_plane_state *old_plane_state, *new_plane_state;
+	struct vc4_dev *vc4 = to_vc4_dev(state->dev);
+	struct vc4_load_tracker_state *load_state;
+	struct drm_private_state *priv_state;
+	struct drm_plane *plane;
+	int i;
+
+	priv_state = drm_atomic_get_private_obj_state(state,
+						      &vc4->load_tracker);
+	if (IS_ERR(priv_state))
+		return PTR_ERR(priv_state);
+
+	load_state = to_vc4_load_tracker_state(priv_state);
+	for_each_oldnew_plane_in_state(state, plane, old_plane_state,
+				       new_plane_state, i) {
+		struct vc4_plane_state *vc4_plane_state;
+
+		if (old_plane_state->fb && old_plane_state->crtc) {
+			vc4_plane_state = to_vc4_plane_state(old_plane_state);
+			load_state->membus_load -= vc4_plane_state->membus_load;
+			load_state->hvs_load -= vc4_plane_state->hvs_load;
+		}
+
+		if (new_plane_state->fb && new_plane_state->crtc) {
+			vc4_plane_state = to_vc4_plane_state(new_plane_state);
+			load_state->membus_load += vc4_plane_state->membus_load;
+			load_state->hvs_load += vc4_plane_state->hvs_load;
+		}
+	}
+
+	/* The abolsute limit is 2Gbyte/sec, but let's take a margin to let
+	 * the system work when other blocks are accessing the memory.
+	 */
+	if (load_state->membus_load > SZ_1G + SZ_512M)
+		return -ENOSPC;
+
+	/* HVS clock is supposed to run @ 250Mhz, let's take a margin and
+	 * consider the maximum number of cycles is 240M.
+	 */
+	if (load_state->hvs_load > 240000000ULL)
+		return -ENOSPC;
+
+	return 0;
+}
+
+static struct drm_private_state *
+vc4_load_tracker_duplicate_state(struct drm_private_obj *obj)
+{
+	struct vc4_load_tracker_state *state;
+
+	state = kmemdup(obj->state, sizeof(*state), GFP_KERNEL);
+	if (!state)
+		return NULL;
+
+	__drm_atomic_helper_private_obj_duplicate_state(obj, &state->base);
+
+	return &state->base;
+}
+
+static void vc4_load_tracker_destroy_state(struct drm_private_obj *obj,
+					   struct drm_private_state *state)
+{
+	struct vc4_load_tracker_state *load_state;
+
+	load_state = to_vc4_load_tracker_state(state);
+	kfree(load_state);
+}
+
+static const struct drm_private_state_funcs vc4_load_tracker_state_funcs = {
+	.atomic_duplicate_state = vc4_load_tracker_duplicate_state,
+	.atomic_destroy_state = vc4_load_tracker_destroy_state,
+};
+
 static int
 vc4_atomic_check(struct drm_device *dev, struct drm_atomic_state *state)
 {
@@ -392,7 +479,11 @@  vc4_atomic_check(struct drm_device *dev, struct drm_atomic_state *state)
 	if (ret < 0)
 		return ret;
 
-	return drm_atomic_helper_check(dev, state);
+	ret = drm_atomic_helper_check(dev, state);
+	if (ret)
+		return ret;
+
+	return vc4_load_tracker_atomic_check(state);
 }
 
 static const struct drm_mode_config_funcs vc4_mode_funcs = {
@@ -408,6 +499,7 @@  int vc4_kms_load(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 	struct vc4_ctm_state *ctm_state;
+	struct vc4_load_tracker_state *load_state;
 	int ret;
 
 	sema_init(&vc4->async_modeset, 1);
@@ -437,6 +529,15 @@  int vc4_kms_load(struct drm_device *dev)
 	drm_atomic_private_obj_init(dev, &vc4->ctm_manager, &ctm_state->base,
 				    &vc4_ctm_state_funcs);
 
+	load_state = kzalloc(sizeof(*load_state), GFP_KERNEL);
+	if (!load_state) {
+		drm_atomic_private_obj_fini(&vc4->ctm_manager);
+		return -ENOMEM;
+	}
+
+	drm_atomic_private_obj_init(dev, &vc4->load_tracker, &load_state->base,
+				    &vc4_load_tracker_state_funcs);
+
 	drm_mode_config_reset(dev);
 
 	drm_kms_helper_poll_init(dev);
diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index 0f294d612769..5e9687406517 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -450,6 +450,64 @@  static void vc4_write_scaling_parameters(struct drm_plane_state *state,
 	}
 }
 
+static void vc4_plane_calc_load(struct drm_plane_state *state)
+{
+	unsigned int hvs_load_shift, vrefresh, i;
+	struct drm_framebuffer *fb = state->fb;
+	struct vc4_plane_state *vc4_state;
+	struct drm_crtc_state *crtc_state;
+	unsigned int vscale_factor;
+
+	vc4_state = to_vc4_plane_state(state);
+	crtc_state = drm_atomic_get_existing_crtc_state(state->state,
+							state->crtc);
+	vrefresh = drm_mode_vrefresh(&crtc_state->adjusted_mode);
+
+	/* The HVS is able to process 2 pixels/cycle when scaling the source,
+	 * 4 pixels/cycle otherwise.
+	 * Alpha blending step seems to be pipelined and it's always operating
+	 * at 4 pixels/cycle, so the limiting aspect here seems to be the
+	 * scaler block.
+	 * HVS load is expressed in clk-cycles/sec (AKA Hz).
+	 */
+	if (vc4_state->x_scaling[0] != VC4_SCALING_NONE ||
+	    vc4_state->x_scaling[1] != VC4_SCALING_NONE ||
+	    vc4_state->y_scaling[0] != VC4_SCALING_NONE ||
+	    vc4_state->y_scaling[1] != VC4_SCALING_NONE)
+		hvs_load_shift = 1;
+	else
+		hvs_load_shift = 2;
+
+	vc4_state->membus_load = 0;
+	vc4_state->hvs_load = 0;
+	for (i = 0; i < fb->format->num_planes; i++) {
+		unsigned long pixels_load;
+
+		/* Even if the bandwidth/plane required for a single frame is
+		 *
+		 * vc4_state->src_w[i] * vc4_state->src_h[i] * cpp * vrefresh
+		 *
+		 * when downscaling, we have to read more pixels per line in
+		 * the time frame reserved for a single line, so the bandwidth
+		 * demand can be punctually higher. To account for that, we
+		 * calculate the down-scaling factor and multiply the plane
+		 * load by this number. We're likely over-estimating the read
+		 * demand, but that's better than under-estimating it.
+		 */
+		vscale_factor = DIV_ROUND_UP(vc4_state->src_h[i],
+					     vc4_state->crtc_h);
+		pixels_load = vc4_state->src_w[i] * vc4_state->src_h[i] *
+			      vscale_factor;
+
+		vc4_state->membus_load += fb->format->cpp[i] * pixels_load;
+		vc4_state->hvs_load += pixels_load;
+	}
+
+	vc4_state->hvs_load *= vrefresh;
+	vc4_state->hvs_load >>= hvs_load_shift;
+	vc4_state->membus_load *= vrefresh;
+}
+
 /* Writes out a full display list for an active plane to the plane's
  * private dlist state.
  */
@@ -769,6 +827,8 @@  static int vc4_plane_mode_set(struct drm_plane *plane,
 	vc4_state->needs_bg_fill = fb->format->has_alpha || !covers_screen ||
 				   state->alpha != DRM_BLEND_ALPHA_OPAQUE;
 
+	vc4_plane_calc_load(state);
+
 	return 0;
 }
 

Comments

Boris Brezillon <boris.brezillon@bootlin.com> writes:

> The HVS block is supposed to fill the pixelvalve FIFOs fast enough to
> meet the requested framerate. The problem is, the HVS and memory bus
> bandwidths are limited, and if we don't take these limitations into
> account we might end up with HVS underflow errors.
>
> This patch is trying to model the per-plane HVS and memory bus bandwidth
> consumption and take a decision at atomic_check() time whether the
> estimated load will fit in the HVS and membus budget.
>
> Note that we take an extra margin on the memory bus consumption to let
> the system run smoothly when other blocks are doing heavy use of the
> memory bus. Same goes for the HVS limit, except the margin is smaller in
> this case, since the HVS is not used by external components.
>
> Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
> ---
> Changes in v2:
> - Remove an unused var in vc4_load_tracker_atomic_check()
> ---
>  drivers/gpu/drm/vc4/vc4_drv.c   |   1 +
>  drivers/gpu/drm/vc4/vc4_drv.h   |  11 ++++
>  drivers/gpu/drm/vc4/vc4_kms.c   | 103 +++++++++++++++++++++++++++++++-
>  drivers/gpu/drm/vc4/vc4_plane.c |  60 +++++++++++++++++++
>  4 files changed, 174 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
> index f6f5cd80c04d..7195a0bcceb3 100644
> --- a/drivers/gpu/drm/vc4/vc4_drv.c
> +++ b/drivers/gpu/drm/vc4/vc4_drv.c
> @@ -313,6 +313,7 @@ static void vc4_drm_unbind(struct device *dev)
>  
>  	drm_mode_config_cleanup(drm);
>  
> +	drm_atomic_private_obj_fini(&vc4->load_tracker);
>  	drm_atomic_private_obj_fini(&vc4->ctm_manager);
>  
>  	drm_dev_put(drm);
> diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
> index a6c67c65ffbc..11369da944b6 100644
> --- a/drivers/gpu/drm/vc4/vc4_drv.h
> +++ b/drivers/gpu/drm/vc4/vc4_drv.h
> @@ -200,6 +200,7 @@ struct vc4_dev {
>  
>  	struct drm_modeset_lock ctm_state_lock;
>  	struct drm_private_obj ctm_manager;
> +	struct drm_private_obj load_tracker;
>  };
>  
>  static inline struct vc4_dev *
> @@ -369,6 +370,16 @@ struct vc4_plane_state {
>  	 * to enable background color fill.
>  	 */
>  	bool needs_bg_fill;
> +
> +	/* Load of this plane on the HVS block. The load is expressed in HVS
> +	 * cycles/sec.
> +	 */
> +	u64 hvs_load;
> +
> +	/* Memory bandwidth needed for this plane. This is expressed in
> +	 * bytes/sec.
> +	 */
> +	u64 membus_load;
>  };
>  
>  static inline struct vc4_plane_state *
> diff --git a/drivers/gpu/drm/vc4/vc4_kms.c b/drivers/gpu/drm/vc4/vc4_kms.c
> index 8e0183b1e8bb..b905a19c1470 100644
> --- a/drivers/gpu/drm/vc4/vc4_kms.c
> +++ b/drivers/gpu/drm/vc4/vc4_kms.c
> @@ -34,6 +34,18 @@ static struct vc4_ctm_state *to_vc4_ctm_state(struct drm_private_state *priv)
>  	return container_of(priv, struct vc4_ctm_state, base);
>  }
>  
> +struct vc4_load_tracker_state {
> +	struct drm_private_state base;
> +	u64 hvs_load;
> +	u64 membus_load;
> +};
> +
> +static struct vc4_load_tracker_state *
> +to_vc4_load_tracker_state(struct drm_private_state *priv)
> +{
> +	return container_of(priv, struct vc4_load_tracker_state, base);
> +}
> +
>  static struct vc4_ctm_state *vc4_get_ctm_state(struct drm_atomic_state *state,
>  					       struct drm_private_obj *manager)
>  {
> @@ -383,6 +395,81 @@ vc4_ctm_atomic_check(struct drm_device *dev, struct drm_atomic_state *state)
>  	return 0;
>  }
>  
> +static int vc4_load_tracker_atomic_check(struct drm_atomic_state *state)
> +{
> +	struct drm_plane_state *old_plane_state, *new_plane_state;
> +	struct vc4_dev *vc4 = to_vc4_dev(state->dev);
> +	struct vc4_load_tracker_state *load_state;
> +	struct drm_private_state *priv_state;
> +	struct drm_plane *plane;
> +	int i;
> +
> +	priv_state = drm_atomic_get_private_obj_state(state,
> +						      &vc4->load_tracker);
> +	if (IS_ERR(priv_state))
> +		return PTR_ERR(priv_state);
> +
> +	load_state = to_vc4_load_tracker_state(priv_state);
> +	for_each_oldnew_plane_in_state(state, plane, old_plane_state,
> +				       new_plane_state, i) {
> +		struct vc4_plane_state *vc4_plane_state;
> +
> +		if (old_plane_state->fb && old_plane_state->crtc) {
> +			vc4_plane_state = to_vc4_plane_state(old_plane_state);
> +			load_state->membus_load -= vc4_plane_state->membus_load;
> +			load_state->hvs_load -= vc4_plane_state->hvs_load;
> +		}
> +
> +		if (new_plane_state->fb && new_plane_state->crtc) {
> +			vc4_plane_state = to_vc4_plane_state(new_plane_state);
> +			load_state->membus_load += vc4_plane_state->membus_load;
> +			load_state->hvs_load += vc4_plane_state->hvs_load;
> +		}
> +	}
> +
> +	/* The abolsute limit is 2Gbyte/sec, but let's take a margin to let

"absolute"

> +	 * the system work when other blocks are accessing the memory.
> +	 */
> +	if (load_state->membus_load > SZ_1G + SZ_512M)
> +		return -ENOSPC;


> +	/* HVS clock is supposed to run @ 250Mhz, let's take a margin and
> +	 * consider the maximum number of cycles is 240M.
> +	 */
> +	if (load_state->hvs_load > 240000000ULL)
> +		return -ENOSPC;

Some day we should probably be using the HVS's actual clock from DT if
available instead of a hardcoded number here.  Good enough for now.

> +static void vc4_load_tracker_destroy_state(struct drm_private_obj *obj,
> +					   struct drm_private_state *state)
> +{
> +	struct vc4_load_tracker_state *load_state;
> +
> +	load_state = to_vc4_load_tracker_state(state);
> +	kfree(load_state);
> +}

Optional: just kfree(state) for simplicity.

> +static void vc4_plane_calc_load(struct drm_plane_state *state)
> +{
> +	unsigned int hvs_load_shift, vrefresh, i;
> +	struct drm_framebuffer *fb = state->fb;
> +	struct vc4_plane_state *vc4_state;
> +	struct drm_crtc_state *crtc_state;
> +	unsigned int vscale_factor;
> +
> +	vc4_state = to_vc4_plane_state(state);
> +	crtc_state = drm_atomic_get_existing_crtc_state(state->state,
> +							state->crtc);
> +	vrefresh = drm_mode_vrefresh(&crtc_state->adjusted_mode);
> +
> +	/* The HVS is able to process 2 pixels/cycle when scaling the source,
> +	 * 4 pixels/cycle otherwise.
> +	 * Alpha blending step seems to be pipelined and it's always operating
> +	 * at 4 pixels/cycle, so the limiting aspect here seems to be the
> +	 * scaler block.
> +	 * HVS load is expressed in clk-cycles/sec (AKA Hz).
> +	 */
> +	if (vc4_state->x_scaling[0] != VC4_SCALING_NONE ||
> +	    vc4_state->x_scaling[1] != VC4_SCALING_NONE ||
> +	    vc4_state->y_scaling[0] != VC4_SCALING_NONE ||
> +	    vc4_state->y_scaling[1] != VC4_SCALING_NONE)
> +		hvs_load_shift = 1;
> +	else
> +		hvs_load_shift = 2;
> +
> +	vc4_state->membus_load = 0;
> +	vc4_state->hvs_load = 0;
> +	for (i = 0; i < fb->format->num_planes; i++) {
> +		unsigned long pixels_load;

I'm scared any time I see longs.  Do you want 32 or 64 bits here?

> +		/* Even if the bandwidth/plane required for a single frame is
> +		 *
> +		 * vc4_state->src_w[i] * vc4_state->src_h[i] * cpp * vrefresh
> +		 *
> +		 * when downscaling, we have to read more pixels per line in
> +		 * the time frame reserved for a single line, so the bandwidth
> +		 * demand can be punctually higher. To account for that, we
> +		 * calculate the down-scaling factor and multiply the plane
> +		 * load by this number. We're likely over-estimating the read
> +		 * demand, but that's better than under-estimating it.
> +		 */
> +		vscale_factor = DIV_ROUND_UP(vc4_state->src_h[i],
> +					     vc4_state->crtc_h);
> +		pixels_load = vc4_state->src_w[i] * vc4_state->src_h[i] *
> +			      vscale_factor;

If we're upscaling (common for video, right?), aren't we under-counting
the cost?  You need to scale/colorspace-convert crtc_w * crtc_h at 2
pixels per cycle.

Overall, this is simpler than I expected it to be, and looks really
promising.  Thanks for working on it!

> +		vc4_state->membus_load += fb->format->cpp[i] * pixels_load;
> +		vc4_state->hvs_load += pixels_load;
> +	}
> +
> +	vc4_state->hvs_load *= vrefresh;
> +	vc4_state->hvs_load >>= hvs_load_shift;
> +	vc4_state->membus_load *= vrefresh;
> +}
> +
Hi Eric,

On Tue, 30 Oct 2018 16:12:55 -0700
Eric Anholt <eric@anholt.net> wrote:

> > +
> > +	/* The abolsute limit is 2Gbyte/sec, but let's take a margin to let  
> 
> "absolute"

Will fix the typo.

> 
> > +	 * the system work when other blocks are accessing the memory.
> > +	 */
> > +	if (load_state->membus_load > SZ_1G + SZ_512M)
> > +		return -ENOSPC;  
> 
> 
> > +	/* HVS clock is supposed to run @ 250Mhz, let's take a margin and
> > +	 * consider the maximum number of cycles is 240M.
> > +	 */
> > +	if (load_state->hvs_load > 240000000ULL)
> > +		return -ENOSPC;  
> 
> Some day we should probably be using the HVS's actual clock from DT if
> available instead of a hardcoded number here.

I agree.

> Good enough for now.
> 
> > +static void vc4_load_tracker_destroy_state(struct drm_private_obj *obj,
> > +					   struct drm_private_state *state)
> > +{
> > +	struct vc4_load_tracker_state *load_state;
> > +
> > +	load_state = to_vc4_load_tracker_state(state);
> > +	kfree(load_state);
> > +}  
> 
> Optional: just kfree(state) for simplicity.

Hm, not sure that's a good idea. kfree(state) works as long as
drm_private_state is the first field in vc4_load_tracker_state, but it
sounds a bit fragile.

I can do

	kfree(to_vc4_load_tracker_state(state));

if you prefer.

> 
> > +static void vc4_plane_calc_load(struct drm_plane_state *state)
> > +{
> > +	unsigned int hvs_load_shift, vrefresh, i;
> > +	struct drm_framebuffer *fb = state->fb;
> > +	struct vc4_plane_state *vc4_state;
> > +	struct drm_crtc_state *crtc_state;
> > +	unsigned int vscale_factor;
> > +
> > +	vc4_state = to_vc4_plane_state(state);
> > +	crtc_state = drm_atomic_get_existing_crtc_state(state->state,
> > +							state->crtc);
> > +	vrefresh = drm_mode_vrefresh(&crtc_state->adjusted_mode);
> > +
> > +	/* The HVS is able to process 2 pixels/cycle when scaling the source,
> > +	 * 4 pixels/cycle otherwise.
> > +	 * Alpha blending step seems to be pipelined and it's always operating
> > +	 * at 4 pixels/cycle, so the limiting aspect here seems to be the
> > +	 * scaler block.
> > +	 * HVS load is expressed in clk-cycles/sec (AKA Hz).
> > +	 */
> > +	if (vc4_state->x_scaling[0] != VC4_SCALING_NONE ||
> > +	    vc4_state->x_scaling[1] != VC4_SCALING_NONE ||
> > +	    vc4_state->y_scaling[0] != VC4_SCALING_NONE ||
> > +	    vc4_state->y_scaling[1] != VC4_SCALING_NONE)
> > +		hvs_load_shift = 1;
> > +	else
> > +		hvs_load_shift = 2;
> > +
> > +	vc4_state->membus_load = 0;
> > +	vc4_state->hvs_load = 0;
> > +	for (i = 0; i < fb->format->num_planes; i++) {
> > +		unsigned long pixels_load;  
> 
> I'm scared any time I see longs.  Do you want 32 or 64 bits here?

I just assumed a 32bit unsigned var would be enough, so unsigned long
seemed just fine. I can use u32 or u64 if you prefer.

> 
> > +		/* Even if the bandwidth/plane required for a single frame is
> > +		 *
> > +		 * vc4_state->src_w[i] * vc4_state->src_h[i] * cpp * vrefresh
> > +		 *
> > +		 * when downscaling, we have to read more pixels per line in
> > +		 * the time frame reserved for a single line, so the bandwidth
> > +		 * demand can be punctually higher. To account for that, we
> > +		 * calculate the down-scaling factor and multiply the plane
> > +		 * load by this number. We're likely over-estimating the read
> > +		 * demand, but that's better than under-estimating it.
> > +		 */
> > +		vscale_factor = DIV_ROUND_UP(vc4_state->src_h[i],
> > +					     vc4_state->crtc_h);
> > +		pixels_load = vc4_state->src_w[i] * vc4_state->src_h[i] *
> > +			      vscale_factor;  
> 
> If we're upscaling (common for video, right?), aren't we under-counting
> the cost?  You need to scale/colorspace-convert crtc_w * crtc_h at 2
> pixels per cycle.

That's not entirely clear to me. I'm not sure what the scaler does when
upscaling. Are the same pixels read several times from the memory? If
that's the case, then the membus load should indeed be based on the
crtc_w,h.

Also, when the spec says the HVS can process 4pixels/cycles, is it 4
input pixels or 4 output pixels per cycle?

Regards,

Boris
On Tue, 30 Oct 2018 16:12:55 -0700
Eric Anholt <eric@anholt.net> wrote:

> > +		/* Even if the bandwidth/plane required for a single frame is
> > +		 *
> > +		 * vc4_state->src_w[i] * vc4_state->src_h[i] * cpp * vrefresh
> > +		 *
> > +		 * when downscaling, we have to read more pixels per line in
> > +		 * the time frame reserved for a single line, so the bandwidth
> > +		 * demand can be punctually higher. To account for that, we
> > +		 * calculate the down-scaling factor and multiply the plane
> > +		 * load by this number. We're likely over-estimating the read
> > +		 * demand, but that's better than under-estimating it.
> > +		 */

One more thing: because we sometimes over estimate the load, we might
reject configuration that were previously accepted, so I'm wondering if
we shouldn't make the 'reject on HVS/membus load too high' optional,
and disable it by default.
Boris Brezillon <boris.brezillon@bootlin.com> writes:

> Hi Eric,
>
> On Tue, 30 Oct 2018 16:12:55 -0700
> Eric Anholt <eric@anholt.net> wrote:
>> > +static void vc4_load_tracker_destroy_state(struct drm_private_obj *obj,
>> > +					   struct drm_private_state *state)
>> > +{
>> > +	struct vc4_load_tracker_state *load_state;
>> > +
>> > +	load_state = to_vc4_load_tracker_state(state);
>> > +	kfree(load_state);
>> > +}  
>> 
>> Optional: just kfree(state) for simplicity.
>
> Hm, not sure that's a good idea. kfree(state) works as long as
> drm_private_state is the first field in vc4_load_tracker_state, but it
> sounds a bit fragile.
>
> I can do
>
> 	kfree(to_vc4_load_tracker_state(state));
>
> if you prefer.

I said optional for that reason :)  Just keep it as is.

>> > +static void vc4_plane_calc_load(struct drm_plane_state *state)
>> > +{
>> > +	unsigned int hvs_load_shift, vrefresh, i;
>> > +	struct drm_framebuffer *fb = state->fb;
>> > +	struct vc4_plane_state *vc4_state;
>> > +	struct drm_crtc_state *crtc_state;
>> > +	unsigned int vscale_factor;
>> > +
>> > +	vc4_state = to_vc4_plane_state(state);
>> > +	crtc_state = drm_atomic_get_existing_crtc_state(state->state,
>> > +							state->crtc);
>> > +	vrefresh = drm_mode_vrefresh(&crtc_state->adjusted_mode);
>> > +
>> > +	/* The HVS is able to process 2 pixels/cycle when scaling the source,
>> > +	 * 4 pixels/cycle otherwise.
>> > +	 * Alpha blending step seems to be pipelined and it's always operating
>> > +	 * at 4 pixels/cycle, so the limiting aspect here seems to be the
>> > +	 * scaler block.
>> > +	 * HVS load is expressed in clk-cycles/sec (AKA Hz).
>> > +	 */
>> > +	if (vc4_state->x_scaling[0] != VC4_SCALING_NONE ||
>> > +	    vc4_state->x_scaling[1] != VC4_SCALING_NONE ||
>> > +	    vc4_state->y_scaling[0] != VC4_SCALING_NONE ||
>> > +	    vc4_state->y_scaling[1] != VC4_SCALING_NONE)
>> > +		hvs_load_shift = 1;
>> > +	else
>> > +		hvs_load_shift = 2;
>> > +
>> > +	vc4_state->membus_load = 0;
>> > +	vc4_state->hvs_load = 0;
>> > +	for (i = 0; i < fb->format->num_planes; i++) {
>> > +		unsigned long pixels_load;  
>> 
>> I'm scared any time I see longs.  Do you want 32 or 64 bits here?
>
> I just assumed a 32bit unsigned var would be enough, so unsigned long
> seemed just fine. I can use u32 or u64 if you prefer.

Yes, please.  See also Maxime's recent trouble with a 64-bit kernel.

>> > +		/* Even if the bandwidth/plane required for a single frame is
>> > +		 *
>> > +		 * vc4_state->src_w[i] * vc4_state->src_h[i] * cpp * vrefresh
>> > +		 *
>> > +		 * when downscaling, we have to read more pixels per line in
>> > +		 * the time frame reserved for a single line, so the bandwidth
>> > +		 * demand can be punctually higher. To account for that, we
>> > +		 * calculate the down-scaling factor and multiply the plane
>> > +		 * load by this number. We're likely over-estimating the read
>> > +		 * demand, but that's better than under-estimating it.
>> > +		 */
>> > +		vscale_factor = DIV_ROUND_UP(vc4_state->src_h[i],
>> > +					     vc4_state->crtc_h);
>> > +		pixels_load = vc4_state->src_w[i] * vc4_state->src_h[i] *
>> > +			      vscale_factor;  
>> 
>> If we're upscaling (common for video, right?), aren't we under-counting
>> the cost?  You need to scale/colorspace-convert crtc_w * crtc_h at 2
>> pixels per cycle.
>
> That's not entirely clear to me. I'm not sure what the scaler does when
> upscaling. Are the same pixels read several times from the memory? If
> that's the case, then the membus load should indeed be based on the
> crtc_w,h.

I'm going to punt on this question because that would be a *lot* of
verilog tracing to figure out for me (and I'm not sure I'd even trust
what I came up with).

> Also, when the spec says the HVS can process 4pixels/cycles, is it 4
> input pixels or 4 output pixels per cycle?

Well, it's 4 pixels per cycle when not scaling, so both :)

I think the scaling pipeline is doing two output pixels per cycle.
Nothing else would make sense to me.
On Thu, 08 Nov 2018 08:26:49 -0800
Eric Anholt <eric@anholt.net> wrote:

> >> > +static void vc4_plane_calc_load(struct drm_plane_state *state)
> >> > +{
> >> > +	unsigned int hvs_load_shift, vrefresh, i;
> >> > +	struct drm_framebuffer *fb = state->fb;
> >> > +	struct vc4_plane_state *vc4_state;
> >> > +	struct drm_crtc_state *crtc_state;
> >> > +	unsigned int vscale_factor;
> >> > +
> >> > +	vc4_state = to_vc4_plane_state(state);
> >> > +	crtc_state = drm_atomic_get_existing_crtc_state(state->state,
> >> > +							state->crtc);
> >> > +	vrefresh = drm_mode_vrefresh(&crtc_state->adjusted_mode);
> >> > +
> >> > +	/* The HVS is able to process 2 pixels/cycle when scaling the source,
> >> > +	 * 4 pixels/cycle otherwise.
> >> > +	 * Alpha blending step seems to be pipelined and it's always operating
> >> > +	 * at 4 pixels/cycle, so the limiting aspect here seems to be the
> >> > +	 * scaler block.
> >> > +	 * HVS load is expressed in clk-cycles/sec (AKA Hz).
> >> > +	 */
> >> > +	if (vc4_state->x_scaling[0] != VC4_SCALING_NONE ||
> >> > +	    vc4_state->x_scaling[1] != VC4_SCALING_NONE ||
> >> > +	    vc4_state->y_scaling[0] != VC4_SCALING_NONE ||
> >> > +	    vc4_state->y_scaling[1] != VC4_SCALING_NONE)
> >> > +		hvs_load_shift = 1;
> >> > +	else
> >> > +		hvs_load_shift = 2;
> >> > +
> >> > +	vc4_state->membus_load = 0;
> >> > +	vc4_state->hvs_load = 0;
> >> > +	for (i = 0; i < fb->format->num_planes; i++) {
> >> > +		unsigned long pixels_load;    
> >> 
> >> I'm scared any time I see longs.  Do you want 32 or 64 bits here?  
> >
> > I just assumed a 32bit unsigned var would be enough, so unsigned long
> > seemed just fine. I can use u32 or u64 if you prefer.  
> 
> Yes, please.  See also Maxime's recent trouble with a 64-bit kernel.

Will use u32 then.

> 
> >> > +		/* Even if the bandwidth/plane required for a single frame is
> >> > +		 *
> >> > +		 * vc4_state->src_w[i] * vc4_state->src_h[i] * cpp * vrefresh
> >> > +		 *
> >> > +		 * when downscaling, we have to read more pixels per line in
> >> > +		 * the time frame reserved for a single line, so the bandwidth
> >> > +		 * demand can be punctually higher. To account for that, we
> >> > +		 * calculate the down-scaling factor and multiply the plane
> >> > +		 * load by this number. We're likely over-estimating the read
> >> > +		 * demand, but that's better than under-estimating it.
> >> > +		 */
> >> > +		vscale_factor = DIV_ROUND_UP(vc4_state->src_h[i],
> >> > +					     vc4_state->crtc_h);
> >> > +		pixels_load = vc4_state->src_w[i] * vc4_state->src_h[i] *
> >> > +			      vscale_factor;    
> >> 
> >> If we're upscaling (common for video, right?), aren't we under-counting
> >> the cost?  You need to scale/colorspace-convert crtc_w * crtc_h at 2
> >> pixels per cycle.  
> >
> > That's not entirely clear to me. I'm not sure what the scaler does when
> > upscaling. Are the same pixels read several times from the memory? If
> > that's the case, then the membus load should indeed be based on the
> > crtc_w,h.  
> 
> I'm going to punt on this question because that would be a *lot* of
> verilog tracing to figure out for me (and I'm not sure I'd even trust
> what I came up with).
> 
> > Also, when the spec says the HVS can process 4pixels/cycles, is it 4
> > input pixels or 4 output pixels per cycle?  
> 
> Well, it's 4 pixels per cycle when not scaling, so both :)

Sorry, I meant 2pixels/cycle :).

> 
> I think the scaling pipeline is doing two output pixels per cycle.
> Nothing else would make sense to me.

Okay, so the HVS load should be based on crtc_w/h and the membus load
should be based on src_w/h and the scaling ratio, right?
Boris Brezillon <boris.brezillon@bootlin.com> writes:

> On Thu, 08 Nov 2018 08:26:49 -0800
> Eric Anholt <eric@anholt.net> wrote:
>
>> >> > +static void vc4_plane_calc_load(struct drm_plane_state *state)
>> >> > +{
>> >> > +	unsigned int hvs_load_shift, vrefresh, i;
>> >> > +	struct drm_framebuffer *fb = state->fb;
>> >> > +	struct vc4_plane_state *vc4_state;
>> >> > +	struct drm_crtc_state *crtc_state;
>> >> > +	unsigned int vscale_factor;
>> >> > +
>> >> > +	vc4_state = to_vc4_plane_state(state);
>> >> > +	crtc_state = drm_atomic_get_existing_crtc_state(state->state,
>> >> > +							state->crtc);
>> >> > +	vrefresh = drm_mode_vrefresh(&crtc_state->adjusted_mode);
>> >> > +
>> >> > +	/* The HVS is able to process 2 pixels/cycle when scaling the source,
>> >> > +	 * 4 pixels/cycle otherwise.
>> >> > +	 * Alpha blending step seems to be pipelined and it's always operating
>> >> > +	 * at 4 pixels/cycle, so the limiting aspect here seems to be the
>> >> > +	 * scaler block.
>> >> > +	 * HVS load is expressed in clk-cycles/sec (AKA Hz).
>> >> > +	 */
>> >> > +	if (vc4_state->x_scaling[0] != VC4_SCALING_NONE ||
>> >> > +	    vc4_state->x_scaling[1] != VC4_SCALING_NONE ||
>> >> > +	    vc4_state->y_scaling[0] != VC4_SCALING_NONE ||
>> >> > +	    vc4_state->y_scaling[1] != VC4_SCALING_NONE)
>> >> > +		hvs_load_shift = 1;
>> >> > +	else
>> >> > +		hvs_load_shift = 2;
>> >> > +
>> >> > +	vc4_state->membus_load = 0;
>> >> > +	vc4_state->hvs_load = 0;
>> >> > +	for (i = 0; i < fb->format->num_planes; i++) {
>> >> > +		unsigned long pixels_load;    
>> >> 
>> >> I'm scared any time I see longs.  Do you want 32 or 64 bits here?  
>> >
>> > I just assumed a 32bit unsigned var would be enough, so unsigned long
>> > seemed just fine. I can use u32 or u64 if you prefer.  
>> 
>> Yes, please.  See also Maxime's recent trouble with a 64-bit kernel.
>
> Will use u32 then.
>
>> 
>> >> > +		/* Even if the bandwidth/plane required for a single frame is
>> >> > +		 *
>> >> > +		 * vc4_state->src_w[i] * vc4_state->src_h[i] * cpp * vrefresh
>> >> > +		 *
>> >> > +		 * when downscaling, we have to read more pixels per line in
>> >> > +		 * the time frame reserved for a single line, so the bandwidth
>> >> > +		 * demand can be punctually higher. To account for that, we
>> >> > +		 * calculate the down-scaling factor and multiply the plane
>> >> > +		 * load by this number. We're likely over-estimating the read
>> >> > +		 * demand, but that's better than under-estimating it.
>> >> > +		 */
>> >> > +		vscale_factor = DIV_ROUND_UP(vc4_state->src_h[i],
>> >> > +					     vc4_state->crtc_h);
>> >> > +		pixels_load = vc4_state->src_w[i] * vc4_state->src_h[i] *
>> >> > +			      vscale_factor;    
>> >> 
>> >> If we're upscaling (common for video, right?), aren't we under-counting
>> >> the cost?  You need to scale/colorspace-convert crtc_w * crtc_h at 2
>> >> pixels per cycle.  
>> >
>> > That's not entirely clear to me. I'm not sure what the scaler does when
>> > upscaling. Are the same pixels read several times from the memory? If
>> > that's the case, then the membus load should indeed be based on the
>> > crtc_w,h.  
>> 
>> I'm going to punt on this question because that would be a *lot* of
>> verilog tracing to figure out for me (and I'm not sure I'd even trust
>> what I came up with).
>> 
>> > Also, when the spec says the HVS can process 4pixels/cycles, is it 4
>> > input pixels or 4 output pixels per cycle?  
>> 
>> Well, it's 4 pixels per cycle when not scaling, so both :)
>
> Sorry, I meant 2pixels/cycle :).
>
>> 
>> I think the scaling pipeline is doing two output pixels per cycle.
>> Nothing else would make sense to me.
>
> Okay, so the HVS load should be based on crtc_w/h and the membus load
> should be based on src_w/h and the scaling ratio, right?

That sounds about right to me.