[4/4] radeonsi: use SDMA for uploading data through const_uploader

Submitted by Marek Olšák on Feb. 20, 2019, 3:04 a.m.

Details

Message ID CAAxE2A5paM7HyX0=KfRTjecyfe74CJUMC-4pPEWqrV0zYnYCjQ@mail.gmail.com
State New
Headers show
Series "RadeonSI: Upload constants to VRAM via SDMA" ( rev: 2 ) in Mesa

Not browsing as part of any series.

Commit Message

Marek Olšák Feb. 20, 2019, 3:04 a.m.
I'll remove the env var.

Additionally, I'm amending this:

+                               uploader = sctx->b.stream_uploader;

-                       u_upload_alloc(ctx->stream_uploader, 0,
+                       u_upload_alloc(uploader, 0,
                                       box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT),
                                       sctx->screen->info.tcc_cache_line_size,
                                       &offset, (struct pipe_resource**)&staging,

Marek

On Mon, Feb 11, 2019 at 4:38 AM Nicolai Hähnle <nhaehnle@gmail.com> wrote:

> On 07.02.19 02:22, Marek Olšák wrote:
> > +     bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->dma_cs && debug_get_bool_option("SDMA", true);
>
> Could you please namespace the environment variable, e.g. RADEONSI_SDMA?
>
> Apart from that, series is
>
> Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
>
>
> > +     sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024,
> > +                                              0, PIPE_USAGE_DEFAULT,
> > +                                              SI_RESOURCE_FLAG_32BIT |
> > +                                              (use_sdma_upload ?
> > +                                                      SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA :
> > +                                                      (sscreen->cpdma_prefetch_writes_memory ?
> > +                                                                0 : SI_RESOURCE_FLAG_READ_ONLY)));
> > +     if (!sctx->b.const_uploader)
> > +             goto fail;
> > +
> > +     if (use_sdma_upload)
> > +             u_upload_enable_flush_explicit(sctx->b.const_uploader);
> > +
> >       si_init_buffer_functions(sctx);
> >       si_init_clear_functions(sctx);
> >       si_init_blit_functions(sctx);
> >       si_init_compute_functions(sctx);
> >       si_init_compute_blit_functions(sctx);
> >       si_init_debug_functions(sctx);
> >       si_init_msaa_functions(sctx);
> >       si_init_streamout_functions(sctx);
> >
> >       if (sscreen->info.has_hw_decode) {
> > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h
> b/src/gallium/drivers/radeonsi/si_pipe.h
> > index b01d5744752..b208bdeb848 100644
> > --- a/src/gallium/drivers/radeonsi/si_pipe.h
> > +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> > @@ -103,20 +103,22 @@
> >   #define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024
> >
> >   #define SI_RESOURCE_FLAG_TRANSFER   (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
> >   #define SI_RESOURCE_FLAG_FLUSHED_DEPTH
> (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
> >   #define SI_RESOURCE_FLAG_FORCE_MSAA_TILING
> (PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
> >   #define SI_RESOURCE_FLAG_DISABLE_DCC
> (PIPE_RESOURCE_FLAG_DRV_PRIV << 3)
> >   #define SI_RESOURCE_FLAG_UNMAPPABLE (PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
> >   #define SI_RESOURCE_FLAG_READ_ONLY  (PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
> >   #define SI_RESOURCE_FLAG_32BIT
> (PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
> >   #define SI_RESOURCE_FLAG_CLEAR
> (PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
> > +/* For const_uploader, upload data via GTT and copy to VRAM on context
> flush via SDMA. */
> > +#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA
> (PIPE_RESOURCE_FLAG_DRV_PRIV << 8)
> >
> >   enum si_clear_code
> >   {
> >       DCC_CLEAR_COLOR_0000   = 0x00000000,
> >       DCC_CLEAR_COLOR_0001   = 0x40404040,
> >       DCC_CLEAR_COLOR_1110   = 0x80808080,
> >       DCC_CLEAR_COLOR_1111   = 0xC0C0C0C0,
> >       DCC_CLEAR_COLOR_REG    = 0x20202020,
> >       DCC_UNCOMPRESSED       = 0xFFFFFFFF,
> >   };
> > @@ -769,20 +771,28 @@ struct si_saved_cs {
> >       struct si_context       *ctx;
> >       struct radeon_saved_cs  gfx;
> >       struct si_resource      *trace_buf;
> >       unsigned                trace_id;
> >
> >       unsigned                gfx_last_dw;
> >       bool                    flushed;
> >       int64_t                 time_flush;
> >   };
> >
> > +struct si_sdma_upload {
> > +     struct si_resource      *dst;
> > +     struct si_resource      *src;
> > +     unsigned                src_offset;
> > +     unsigned                dst_offset;
> > +     unsigned                size;
> > +};
> > +
> >   struct si_context {
> >       struct pipe_context             b; /* base class */
> >
> >       enum radeon_family              family;
> >       enum chip_class                 chip_class;
> >
> >       struct radeon_winsys            *ws;
> >       struct radeon_winsys_ctx        *ctx;
> >       struct radeon_cmdbuf            *gfx_cs;
> >       struct radeon_cmdbuf            *dma_cs;
> > @@ -1074,20 +1084,26 @@ struct si_context {
> >       int                             num_perfect_occlusion_queries;
> >       struct list_head                active_queries;
> >       unsigned                        num_cs_dw_queries_suspend;
> >
> >       /* Render condition. */
> >       struct pipe_query               *render_cond;
> >       unsigned                        render_cond_mode;
> >       bool                            render_cond_invert;
> >       bool                            render_cond_force_off; /* for
> u_blitter */
> >
> > +     /* For uploading data via GTT and copy to VRAM on context flush
> via SDMA. */
> > +     bool                            sdma_uploads_in_progress;
> > +     struct si_sdma_upload           *sdma_uploads;
> > +     unsigned                        num_sdma_uploads;
> > +     unsigned                        max_sdma_uploads;
> > +
> >       /* Statistics gathering for the DCC enablement heuristic. It can't
> be
> >        * in si_texture because si_texture can be shared by multiple
> >        * contexts. This is for back buffers only. We shouldn't get too
> many
> >        * of those.
> >        *
> >        * X11 DRI3 rotates among a finite set of back buffers. They should
> >        * all fit in this array. If they don't, separate DCC might never
> be
> >        * enabled by DCC stat gathering.
> >        */
> >       struct {
> > @@ -1273,20 +1289,21 @@ struct pipe_fence_handle *si_create_fence(struct
> pipe_context *ctx,
> >                                         struct tc_unflushed_batch_token
> *tc_token);
> >
> >   /* si_get.c */
> >   void si_init_screen_get_functions(struct si_screen *sscreen);
> >
> >   /* si_gfx_cs.c */
> >   void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
> >                    struct pipe_fence_handle **fence);
> >   void si_begin_new_gfx_cs(struct si_context *ctx);
> >   void si_need_gfx_cs_space(struct si_context *ctx);
> > +void si_unref_sdma_uploads(struct si_context *sctx);
> >
> >   /* si_gpu_load.c */
> >   void si_gpu_load_kill_thread(struct si_screen *sscreen);
> >   uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type);
> >   unsigned si_end_counter(struct si_screen *sscreen, unsigned type,
> >                       uint64_t begin);
> >
> >   /* si_compute.c */
> >   void si_init_compute_functions(struct si_context *sctx);
> >
> >
>
> --
> Lerne, wie die Welt wirklich ist,
> Aber vergiss niemals, wie sie sein sollte.
>

Patch hide | download patch | download mbox

diff --git a/src/gallium/drivers/radeonsi/si_buffer.c
b/src/gallium/drivers/radeonsi/si_buffer.c
index 3f8db7cf4f0..4936eb5a5b1 100644
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -461,10 +461,20 @@  static void *si_buffer_transfer_map(struct
pipe_context *ctx,
                    si_rings_is_buffer_referenced(sctx, buf->buf,
RADEON_USAGE_READWRITE) ||
                    !sctx->ws->buffer_wait(buf->buf, 0,
RADEON_USAGE_READWRITE)) {
                        /* Do a wait-free write-only transfer using a
temporary buffer. */
-                       unsigned offset;
+                       struct u_upload_mgr *uploader;
                        struct si_resource *staging = NULL;
+                       unsigned offset;
+
+                       /* If we are not called from the driver thread, we have
+                        * to use the uploader from u_threaded_context, which is
+                        * local to the calling thread.
+                        */
+                       if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
+                               uploader = sctx->tc->base.stream_uploader;
+                       else