[Spice-devel,v3,6/6] qxl-wddm-dod: Non-forced memory allocations with VSync

Submitted by Yuri Benditovich on April 8, 2017, 10:46 a.m.

Details

Message ID 1491648415-4684-7-git-send-email-yuri.benditovich@daynix.com
State New
Headers show
Series "Set of patches for further support of VSync" ( rev: 3 ) in Spice

Not browsing as part of any series.

Commit Message

Yuri Benditovich April 8, 2017, 10:46 a.m.
In case of VSync is active (for the driver this means it shall take
in account watchdog policy and ensure fast execution of PresentDisplayOnly
callback) allocate bitmaps for drawable objects using non-forced requests.
If immediate allocation is not possible, place entire bitmap into memory
chunk allocated from the OS.
If bitmap is allocated from device memory, but one of later
chunks can't be allocated, allocate this and further chunks from
OS memory. All these 'delayed' allocations placed into linked list
which root entry is part of QXLOutput structure.
From separate thread, before sending drawable objects down, review
the list of delayed chunks and allocate device memory (forced) to
all of them.
The cost of solution is 2 pointers added to each drawable or cursor
object.
Cursor commands currently do not use them; in future we have an option
to offload also cursor commands.

Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
---
 qxldod/QxlDod.cpp | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 qxldod/QxlDod.h   |   3 ++
 2 files changed, 103 insertions(+), 8 deletions(-)

Patch hide | download patch | download mbox

diff --git a/qxldod/QxlDod.cpp b/qxldod/QxlDod.cpp
index c832c93..7ff903a 100755
--- a/qxldod/QxlDod.cpp
+++ b/qxldod/QxlDod.cpp
@@ -4224,6 +4224,13 @@  void QxlDevice::DrawableAddRes(QXLDrawable *drawable, Resource *res)
     AddRes(output, res);
 }
 
+static FORCEINLINE PLIST_ENTRY DelayedList(QXLDrawable *pd)
+{
+    QXLOutput *output;
+    output = (QXLOutput *)((UINT8 *)pd - sizeof(QXLOutput));
+    return &output->list;
+}
+
 void QxlDevice::CursorCmdAddRes(QXLCursorCmd *cmd, Resource *res)
 {
     PAGED_CODE();
@@ -4331,6 +4338,7 @@  QXLDrawable *QxlDevice::Drawable(UINT8 type, CONST RECT *area, CONST RECT *clip,
     drawable->surfaces_dest[1] = - 1;
     drawable->surfaces_dest[2] = -1;
     CopyRect(&drawable->bbox, area);
+    InitializeListHead(DelayedList(drawable));
 
     if (!SetClip(clip, drawable)) {
         DbgPrint(TRACE_LEVEL_VERBOSE, ("%s: set clip failed\n", __FUNCTION__));
@@ -4425,7 +4433,7 @@  BOOLEAN QxlDevice::AttachNewBitmap(QXLDrawable *drawable, UINT8 *src, UINT8 *src
     Resource *image_res;
     InternalImage *internal;
     QXLDataChunk *chunk;
-    PLIST_ENTRY pDelayedList = NULL;
+    PLIST_ENTRY pDelayedList = bForce ? NULL : DelayedList(drawable);
     UINT8* dest, *dest_end;
 
     height = drawable->u.copy.src_area.bottom;
@@ -4494,9 +4502,12 @@  BOOLEAN QxlDevice::AttachNewBitmap(QXLDrawable *drawable, UINT8 *src, UINT8 *src
     }
 
     for (; src != src_end; src -= pitch, alloc_size -= line_size) {
-        BOOLEAN b = PutBytesAlign(&chunk, &dest, &dest_end, src, line_size, alloc_size, pDelayedList);
-        if (!b) {
-            DbgPrint(TRACE_LEVEL_WARNING, ("%s: aborting copy of lines\n", __FUNCTION__));
+        if (!PutBytesAlign(&chunk, &dest, &dest_end, src, line_size, alloc_size, pDelayedList)) {
+            if (pitch < 0 && bForce) {
+                DbgPrint(TRACE_LEVEL_WARNING, ("%s: aborting copy of lines (forced)\n", __FUNCTION__));
+            } else {
+                DbgPrint(TRACE_LEVEL_WARNING, ("%s: unexpected aborting copy of lines (force %d, pitch %d)\n", __FUNCTION__, bForce, pitch));
+            }
             return FALSE;
         }
     }
@@ -4551,7 +4562,13 @@  QXLDrawable *QxlDevice::PrepareBltBits (
     UINT8* src_end = src - pSrc->Pitch;
     src += pSrc->Pitch * (height - 1);
 
-    if (!AttachNewBitmap(drawable, src, src_end, (INT)pSrc->Pitch, TRUE)) {
+    if (!AttachNewBitmap(drawable, src, src_end, (INT)pSrc->Pitch, !g_bSupportVSync)) {
+        PLIST_ENTRY pDelayedList = DelayedList(drawable);
+        // if some delayed chunks were allocated, free them
+        while (!IsListEmpty(pDelayedList)) {
+            QXL_DELAYED_CHUNK *pdc = (QXL_DELAYED_CHUNK *)RemoveHeadList(pDelayedList);
+            delete[] reinterpret_cast<BYTE*>(pdc);
+        }
         ReleaseOutput(drawable->release_info.id);
         drawable = NULL;
     } else {
@@ -5179,11 +5196,76 @@  void QxlDevice::StopPresentThread()
     }
 }
 
+QXLDataChunk *QxlDevice::MakeChunk(QXL_DELAYED_CHUNK *pdc)
+{
+    PAGED_CODE();
+    QXLDataChunk *chunk = (QXLDataChunk *)AllocMem(MSPACE_TYPE_VRAM, pdc->chunk.data_size + sizeof(QXLDataChunk), TRUE);
+    if (chunk)
+    {
+        chunk->data_size = pdc->chunk.data_size;
+        chunk->next_chunk = 0;
+        RtlCopyMemory(chunk->data, pdc->chunk.data, chunk->data_size);
+    }
+    return chunk;
+}
+
+ULONG QxlDevice::PrepareDrawable(QXLDrawable*& drawable)
+{
+    PAGED_CODE();
+    ULONG n = 0;
+    BOOLEAN bFail;
+    PLIST_ENTRY pe = DelayedList(drawable);
+    QXLDataChunk *chunk, *lastchunk = NULL;
+
+    bFail = !m_bActive;
+
+    while (!IsListEmpty(pe)) {
+        QXL_DELAYED_CHUNK *pdc = (QXL_DELAYED_CHUNK *)RemoveHeadList(pe);
+        if (!lastchunk) {
+            lastchunk = (QXLDataChunk *)pdc->chunk.prev_chunk;
+        }
+        if (!bFail && !lastchunk) {
+            // bitmap was not allocated, this is single delayed chunk
+            QXL_ASSERT(IsListEmpty(pe));
+
+            if (AttachNewBitmap(
+                drawable,
+                pdc->chunk.data,
+                pdc->chunk.data + pdc->chunk.data_size,
+                -(drawable->u.copy.src_area.right * 4),
+                TRUE)) {
+                ++n;
+            } else {
+                bFail = TRUE;
+            }
+        }
+        if (!bFail && lastchunk) {
+            // some chunks were not allocated
+            chunk = MakeChunk(pdc);
+            if (chunk) {
+                chunk->prev_chunk = PA(lastchunk, m_SurfaceMemSlot);
+                lastchunk->next_chunk = PA(chunk, m_SurfaceMemSlot);
+                lastchunk = chunk;
+                ++n;
+            } else {
+                bFail = TRUE;
+            }
+        }
+        delete[] reinterpret_cast<BYTE*>(pdc);
+    }
+    if (bFail) {
+        ReleaseOutput(drawable->release_info.id);
+        drawable = NULL;
+    }
+    return n;
+}
+
 void QxlDevice::PresentThreadRoutine()
 {
     PAGED_CODE();
     int wait;
     int notify;
+    ULONG delayed = 0;
     QXLDrawable** drawables;
 
     DbgPrint(TRACE_LEVEL_INFORMATION, ("--->%s\n", __FUNCTION__));
@@ -5206,13 +5288,23 @@  void QxlDevice::PresentThreadRoutine()
 
         if (drawables) {
             for (UINT i = 0; drawables[i]; ++i)
-                PushDrawable(drawables[i]);
+            {
+                ULONG n = PrepareDrawable(drawables[i]);
+                // only reason why drawables[i] is zeroed is stop flow
+                if (drawables[i]) {
+                    delayed += n;
+                    PushDrawable(drawables[i]);
+                }
+            }
             delete[] drawables;
-        }
-        else {
+        } else {
             DbgPrint(TRACE_LEVEL_WARNING, ("%s is being terminated\n", __FUNCTION__));
             break;
         }
+        if (delayed) {
+            DbgPrint(TRACE_LEVEL_WARNING, ("%s: %d delayed chunks\n", __FUNCTION__, delayed));
+            delayed = 0;
+        }
     }
 }
 
diff --git a/qxldod/QxlDod.h b/qxldod/QxlDod.h
index 28373b8..c35ec3e 100755
--- a/qxldod/QxlDod.h
+++ b/qxldod/QxlDod.h
@@ -474,6 +474,7 @@  ReleaseMutex(
 #define MAX_OUTPUT_RES 6
 
 typedef struct QXLOutput {
+    LIST_ENTRY list;
     UINT32 num_res;
 #ifdef DBG
     UINT32 type;
@@ -612,6 +613,8 @@  private:
     BOOLEAN PutBytesAlign(QXLDataChunk **chunk_ptr, UINT8 **now_ptr,
                             UINT8 **end_ptr, UINT8 *src, int size,
                             size_t alloc_size, PLIST_ENTRY pDelayed);
+    QXLDataChunk *MakeChunk(QXL_DELAYED_CHUNK *pdc);
+    ULONG PrepareDrawable(QXLDrawable*& drawable);
     void AsyncIo(UCHAR  Port, UCHAR Value);
     void SyncIo(UCHAR  Port, UCHAR Value);
     NTSTATUS UpdateChildStatus(BOOLEAN connect);

Comments

> 
> In case of VSync is active (for the driver this means it shall take
> in account watchdog policy and ensure fast execution of PresentDisplayOnly
> callback) allocate bitmaps for drawable objects using non-forced requests.
> If immediate allocation is not possible, place entire bitmap into memory
> chunk allocated from the OS.
> If bitmap is allocated from device memory, but one of later
> chunks can't be allocated, allocate this and further chunks from
> OS memory. All these 'delayed' allocations placed into linked list
> which root entry is part of QXLOutput structure.
> From separate thread, before sending drawable objects down, review
> the list of delayed chunks and allocate device memory (forced) to
> all of them.
> The cost of solution is 2 pointers added to each drawable or cursor
> object.
> Cursor commands currently do not use them; in future we have an option
> to offload also cursor commands.
> 
> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> ---
>  qxldod/QxlDod.cpp | 108
>  ++++++++++++++++++++++++++++++++++++++++++++++++++----
>  qxldod/QxlDod.h   |   3 ++
>  2 files changed, 103 insertions(+), 8 deletions(-)
> 
> diff --git a/qxldod/QxlDod.cpp b/qxldod/QxlDod.cpp
> index c832c93..7ff903a 100755
> --- a/qxldod/QxlDod.cpp
> +++ b/qxldod/QxlDod.cpp
> @@ -4224,6 +4224,13 @@ void QxlDevice::DrawableAddRes(QXLDrawable *drawable,
> Resource *res)
>      AddRes(output, res);
>  }
>  
> +static FORCEINLINE PLIST_ENTRY DelayedList(QXLDrawable *pd)
> +{
> +    QXLOutput *output;
> +    output = (QXLOutput *)((UINT8 *)pd - sizeof(QXLOutput));
> +    return &output->list;
> +}
> +

This hacky code should just not exist. This assumes that before
every Drawable there is a QXLOutput.
I know there are already similar code but would be better to
use a structure that contains a QXLOutput and a QXLDrawable and
use that.

>  void QxlDevice::CursorCmdAddRes(QXLCursorCmd *cmd, Resource *res)
>  {
>      PAGED_CODE();
> @@ -4331,6 +4338,7 @@ QXLDrawable *QxlDevice::Drawable(UINT8 type, CONST RECT
> *area, CONST RECT *clip,
>      drawable->surfaces_dest[1] = - 1;
>      drawable->surfaces_dest[2] = -1;
>      CopyRect(&drawable->bbox, area);
> +    InitializeListHead(DelayedList(drawable));
>  

Use a constructor?

>      if (!SetClip(clip, drawable)) {
>          DbgPrint(TRACE_LEVEL_VERBOSE, ("%s: set clip failed\n",
>          __FUNCTION__));
> @@ -4425,7 +4433,7 @@ BOOLEAN QxlDevice::AttachNewBitmap(QXLDrawable
> *drawable, UINT8 *src, UINT8 *src
>      Resource *image_res;
>      InternalImage *internal;
>      QXLDataChunk *chunk;
> -    PLIST_ENTRY pDelayedList = NULL;
> +    PLIST_ENTRY pDelayedList = bForce ? NULL : DelayedList(drawable);
>      UINT8* dest, *dest_end;
>  
>      height = drawable->u.copy.src_area.bottom;
> @@ -4494,9 +4502,12 @@ BOOLEAN QxlDevice::AttachNewBitmap(QXLDrawable
> *drawable, UINT8 *src, UINT8 *src
>      }
>  
>      for (; src != src_end; src -= pitch, alloc_size -= line_size) {
> -        BOOLEAN b = PutBytesAlign(&chunk, &dest, &dest_end, src, line_size,
> alloc_size, pDelayedList);
> -        if (!b) {
> -            DbgPrint(TRACE_LEVEL_WARNING, ("%s: aborting copy of lines\n",
> __FUNCTION__));
> +        if (!PutBytesAlign(&chunk, &dest, &dest_end, src, line_size,
> alloc_size, pDelayedList)) {
> +            if (pitch < 0 && bForce) {
> +                DbgPrint(TRACE_LEVEL_WARNING, ("%s: aborting copy of lines
> (forced)\n", __FUNCTION__));
> +            } else {
> +                DbgPrint(TRACE_LEVEL_WARNING, ("%s: unexpected aborting copy
> of lines (force %d, pitch %d)\n", __FUNCTION__, bForce, pitch));
> +            }

Why the if on the pitch and not just this last version?

>              return FALSE;
>          }
>      }
> @@ -4551,7 +4562,13 @@ QXLDrawable *QxlDevice::PrepareBltBits (
>      UINT8* src_end = src - pSrc->Pitch;
>      src += pSrc->Pitch * (height - 1);
>  
> -    if (!AttachNewBitmap(drawable, src, src_end, (INT)pSrc->Pitch, TRUE)) {
> +    if (!AttachNewBitmap(drawable, src, src_end, (INT)pSrc->Pitch,
> !g_bSupportVSync)) {
> +        PLIST_ENTRY pDelayedList = DelayedList(drawable);
> +        // if some delayed chunks were allocated, free them
> +        while (!IsListEmpty(pDelayedList)) {
> +            QXL_DELAYED_CHUNK *pdc = (QXL_DELAYED_CHUNK
> *)RemoveHeadList(pDelayedList);
> +            delete[] reinterpret_cast<BYTE*>(pdc);
> +        }

destructor?

>          ReleaseOutput(drawable->release_info.id);

Why ReleaseOutput is not releasing the linked list, which is
in it?

>          drawable = NULL;
>      } else {
> @@ -5179,11 +5196,76 @@ void QxlDevice::StopPresentThread()
>      }
>  }
>  
> +QXLDataChunk *QxlDevice::MakeChunk(QXL_DELAYED_CHUNK *pdc)
> +{
> +    PAGED_CODE();
> +    QXLDataChunk *chunk = (QXLDataChunk *)AllocMem(MSPACE_TYPE_VRAM,
> pdc->chunk.data_size + sizeof(QXLDataChunk), TRUE);
> +    if (chunk)
> +    {
> +        chunk->data_size = pdc->chunk.data_size;
> +        chunk->next_chunk = 0;
> +        RtlCopyMemory(chunk->data, pdc->chunk.data, chunk->data_size);
> +    }
> +    return chunk;
> +}
> +
> +ULONG QxlDevice::PrepareDrawable(QXLDrawable*& drawable)
> +{
> +    PAGED_CODE();
> +    ULONG n = 0;
> +    BOOLEAN bFail;
> +    PLIST_ENTRY pe = DelayedList(drawable);
> +    QXLDataChunk *chunk, *lastchunk = NULL;
> +
> +    bFail = !m_bActive;
> +
> +    while (!IsListEmpty(pe)) {
> +        QXL_DELAYED_CHUNK *pdc = (QXL_DELAYED_CHUNK *)RemoveHeadList(pe);
> +        if (!lastchunk) {
> +            lastchunk = (QXLDataChunk *)pdc->chunk.prev_chunk;
> +        }
> +        if (!bFail && !lastchunk) {
> +            // bitmap was not allocated, this is single delayed chunk
> +            QXL_ASSERT(IsListEmpty(pe));
> +
> +            if (AttachNewBitmap(
> +                drawable,
> +                pdc->chunk.data,
> +                pdc->chunk.data + pdc->chunk.data_size,
> +                -(drawable->u.copy.src_area.right * 4),
> +                TRUE)) {
> +                ++n;
> +            } else {
> +                bFail = TRUE;
> +            }
> +        }
> +        if (!bFail && lastchunk) {
> +            // some chunks were not allocated
> +            chunk = MakeChunk(pdc);
> +            if (chunk) {
> +                chunk->prev_chunk = PA(lastchunk, m_SurfaceMemSlot);
> +                lastchunk->next_chunk = PA(chunk, m_SurfaceMemSlot);
> +                lastchunk = chunk;
> +                ++n;
> +            } else {
> +                bFail = TRUE;
> +            }
> +        }
> +        delete[] reinterpret_cast<BYTE*>(pdc);
> +    }
> +    if (bFail) {
> +        ReleaseOutput(drawable->release_info.id);

This will leak memory if is not the last chunk and some allocations
will fail (in stop flow).


> +        drawable = NULL;
> +    }
> +    return n;
> +}
> +
>  void QxlDevice::PresentThreadRoutine()
>  {
>      PAGED_CODE();
>      int wait;
>      int notify;
> +    ULONG delayed = 0;
>      QXLDrawable** drawables;
>  
>      DbgPrint(TRACE_LEVEL_INFORMATION, ("--->%s\n", __FUNCTION__));
> @@ -5206,13 +5288,23 @@ void QxlDevice::PresentThreadRoutine()
>  
>          if (drawables) {
>              for (UINT i = 0; drawables[i]; ++i)
> -                PushDrawable(drawables[i]);
> +            {
> +                ULONG n = PrepareDrawable(drawables[i]);
> +                // only reason why drawables[i] is zeroed is stop flow
> +                if (drawables[i]) {
> +                    delayed += n;
> +                    PushDrawable(drawables[i]);
> +                }
> +            }
>              delete[] drawables;
> -        }
> -        else {
> +        } else {
>              DbgPrint(TRACE_LEVEL_WARNING, ("%s is being terminated\n",
>              __FUNCTION__));
>              break;
>          }
> +        if (delayed) {
> +            DbgPrint(TRACE_LEVEL_WARNING, ("%s: %d delayed chunks\n",
> __FUNCTION__, delayed));
> +            delayed = 0;

If you move the variable declaration/initialization inside the for
this is not needed.

> +        }
>      }
>  }
>  
> diff --git a/qxldod/QxlDod.h b/qxldod/QxlDod.h
> index 28373b8..c35ec3e 100755
> --- a/qxldod/QxlDod.h
> +++ b/qxldod/QxlDod.h
> @@ -474,6 +474,7 @@ ReleaseMutex(
>  #define MAX_OUTPUT_RES 6
>  
>  typedef struct QXLOutput {
> +    LIST_ENTRY list;
>      UINT32 num_res;
>  #ifdef DBG
>      UINT32 type;
> @@ -612,6 +613,8 @@ private:
>      BOOLEAN PutBytesAlign(QXLDataChunk **chunk_ptr, UINT8 **now_ptr,
>                              UINT8 **end_ptr, UINT8 *src, int size,
>                              size_t alloc_size, PLIST_ENTRY pDelayed);
> +    QXLDataChunk *MakeChunk(QXL_DELAYED_CHUNK *pdc);
> +    ULONG PrepareDrawable(QXLDrawable*& drawable);
>      void AsyncIo(UCHAR  Port, UCHAR Value);
>      void SyncIo(UCHAR  Port, UCHAR Value);
>      NTSTATUS UpdateChildStatus(BOOLEAN connect);

Frediano
On Mon, Apr 10, 2017 at 3:45 PM, Frediano Ziglio <fziglio@redhat.com> wrote:

> >
> > In case of VSync is active (for the driver this means it shall take
> > in account watchdog policy and ensure fast execution of
> PresentDisplayOnly
> > callback) allocate bitmaps for drawable objects using non-forced
> requests.
> > If immediate allocation is not possible, place entire bitmap into memory
> > chunk allocated from the OS.
> > If bitmap is allocated from device memory, but one of later
> > chunks can't be allocated, allocate this and further chunks from
> > OS memory. All these 'delayed' allocations placed into linked list
> > which root entry is part of QXLOutput structure.
> > From separate thread, before sending drawable objects down, review
> > the list of delayed chunks and allocate device memory (forced) to
> > all of them.
> > The cost of solution is 2 pointers added to each drawable or cursor
> > object.
> > Cursor commands currently do not use them; in future we have an option
> > to offload also cursor commands.
> >
> > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> > ---
> >  qxldod/QxlDod.cpp | 108
> >  ++++++++++++++++++++++++++++++++++++++++++++++++++----
> >  qxldod/QxlDod.h   |   3 ++
> >  2 files changed, 103 insertions(+), 8 deletions(-)
> >
> > diff --git a/qxldod/QxlDod.cpp b/qxldod/QxlDod.cpp
> > index c832c93..7ff903a 100755
> > --- a/qxldod/QxlDod.cpp
> > +++ b/qxldod/QxlDod.cpp
> > @@ -4224,6 +4224,13 @@ void QxlDevice::DrawableAddRes(QXLDrawable
> *drawable,
> > Resource *res)
> >      AddRes(output, res);
> >  }
> >
> > +static FORCEINLINE PLIST_ENTRY DelayedList(QXLDrawable *pd)
> > +{
> > +    QXLOutput *output;
> > +    output = (QXLOutput *)((UINT8 *)pd - sizeof(QXLOutput));
> > +    return &output->list;
> > +}
> > +
>
> This hacky code should just not exist. This assumes that before
> every Drawable there is a QXLOutput.
>

This is correct assumption, see creation of drawable.
DrawableAddRes does exactly the same


> I know there are already similar code but would be better to
> use a structure that contains a QXLOutput and a QXLDrawable and
> use that.
>
>
This would create more changes in the code than really needed.


> >  void QxlDevice::CursorCmdAddRes(QXLCursorCmd *cmd, Resource *res)
> >  {
> >      PAGED_CODE();
> > @@ -4331,6 +4338,7 @@ QXLDrawable *QxlDevice::Drawable(UINT8 type, CONST
> RECT
> > *area, CONST RECT *clip,
> >      drawable->surfaces_dest[1] = - 1;
> >      drawable->surfaces_dest[2] = -1;
> >      CopyRect(&drawable->bbox, area);
> > +    InitializeListHead(DelayedList(drawable));
> >
>
> Use a constructor?
>

I did not plan to rework all the existing code.
The rule of thumb is as less changes as possible.


>
> >      if (!SetClip(clip, drawable)) {
> >          DbgPrint(TRACE_LEVEL_VERBOSE, ("%s: set clip failed\n",
> >          __FUNCTION__));
> > @@ -4425,7 +4433,7 @@ BOOLEAN QxlDevice::AttachNewBitmap(QXLDrawable
> > *drawable, UINT8 *src, UINT8 *src
> >      Resource *image_res;
> >      InternalImage *internal;
> >      QXLDataChunk *chunk;
> > -    PLIST_ENTRY pDelayedList = NULL;
> > +    PLIST_ENTRY pDelayedList = bForce ? NULL : DelayedList(drawable);
> >      UINT8* dest, *dest_end;
> >
> >      height = drawable->u.copy.src_area.bottom;
> > @@ -4494,9 +4502,12 @@ BOOLEAN QxlDevice::AttachNewBitmap(QXLDrawable
> > *drawable, UINT8 *src, UINT8 *src
> >      }
> >
> >      for (; src != src_end; src -= pitch, alloc_size -= line_size) {
> > -        BOOLEAN b = PutBytesAlign(&chunk, &dest, &dest_end, src,
> line_size,
> > alloc_size, pDelayedList);
> > -        if (!b) {
> > -            DbgPrint(TRACE_LEVEL_WARNING, ("%s: aborting copy of
> lines\n",
> > __FUNCTION__));
> > +        if (!PutBytesAlign(&chunk, &dest, &dest_end, src, line_size,
> > alloc_size, pDelayedList)) {
> > +            if (pitch < 0 && bForce) {
> > +                DbgPrint(TRACE_LEVEL_WARNING, ("%s: aborting copy of
> lines
> > (forced)\n", __FUNCTION__));
> > +            } else {
> > +                DbgPrint(TRACE_LEVEL_WARNING, ("%s: unexpected aborting
> copy
> > of lines (force %d, pitch %d)\n", __FUNCTION__, bForce, pitch));
> > +            }
>
> Why the if on the pitch and not just this last version?
>

> >              return FALSE;
> >          }
> >      }
> > @@ -4551,7 +4562,13 @@ QXLDrawable *QxlDevice::PrepareBltBits (
> >      UINT8* src_end = src - pSrc->Pitch;
> >      src += pSrc->Pitch * (height - 1);
> >
> > -    if (!AttachNewBitmap(drawable, src, src_end, (INT)pSrc->Pitch,
> TRUE)) {
> > +    if (!AttachNewBitmap(drawable, src, src_end, (INT)pSrc->Pitch,
> > !g_bSupportVSync)) {
> > +        PLIST_ENTRY pDelayedList = DelayedList(drawable);
> > +        // if some delayed chunks were allocated, free them
> > +        while (!IsListEmpty(pDelayedList)) {
> > +            QXL_DELAYED_CHUNK *pdc = (QXL_DELAYED_CHUNK
> > *)RemoveHeadList(pDelayedList);
> > +            delete[] reinterpret_cast<BYTE*>(pdc);
> > +        }
>
> destructor?
>

Lifecycle of drawable is in general larger than lifecycle of delayed chunks.
In main flow we need to free chunks allocated from OS earlier than we
deallocate the drawable.


>
> >          ReleaseOutput(drawable->release_info.id);
>
> Why ReleaseOutput is not releasing the linked list, which is
> in it?
>
>
The driver in any flow must free each memory allocation from OS (this is
checked by the driver verifier).
Objects allocated from device's memory are released when the ring is being
processed, i.e. much later.
In corner cases we do not release drawable objects at all. So we need to
free the memory allocated from OS before we send the drawable object down.



> >          drawable = NULL;
> >      } else {
> > @@ -5179,11 +5196,76 @@ void QxlDevice::StopPresentThread()
> >      }
> >  }
> >
> > +QXLDataChunk *QxlDevice::MakeChunk(QXL_DELAYED_CHUNK *pdc)
> > +{
> > +    PAGED_CODE();
> > +    QXLDataChunk *chunk = (QXLDataChunk *)AllocMem(MSPACE_TYPE_VRAM,
> > pdc->chunk.data_size + sizeof(QXLDataChunk), TRUE);
> > +    if (chunk)
> > +    {
> > +        chunk->data_size = pdc->chunk.data_size;
> > +        chunk->next_chunk = 0;
> > +        RtlCopyMemory(chunk->data, pdc->chunk.data, chunk->data_size);
> > +    }
> > +    return chunk;
> > +}
> > +
> > +ULONG QxlDevice::PrepareDrawable(QXLDrawable*& drawable)
> > +{
> > +    PAGED_CODE();
> > +    ULONG n = 0;
> > +    BOOLEAN bFail;
> > +    PLIST_ENTRY pe = DelayedList(drawable);
> > +    QXLDataChunk *chunk, *lastchunk = NULL;
> > +
> > +    bFail = !m_bActive;
> > +
> > +    while (!IsListEmpty(pe)) {
> > +        QXL_DELAYED_CHUNK *pdc = (QXL_DELAYED_CHUNK
> *)RemoveHeadList(pe);
> > +        if (!lastchunk) {
> > +            lastchunk = (QXLDataChunk *)pdc->chunk.prev_chunk;
> > +        }
> > +        if (!bFail && !lastchunk) {
> > +            // bitmap was not allocated, this is single delayed chunk
> > +            QXL_ASSERT(IsListEmpty(pe));
> > +
> > +            if (AttachNewBitmap(
> > +                drawable,
> > +                pdc->chunk.data,
> > +                pdc->chunk.data + pdc->chunk.data_size,
> > +                -(drawable->u.copy.src_area.right * 4),
> > +                TRUE)) {
> > +                ++n;
> > +            } else {
> > +                bFail = TRUE;
> > +            }
> > +        }
> > +        if (!bFail && lastchunk) {
> > +            // some chunks were not allocated
> > +            chunk = MakeChunk(pdc);
> > +            if (chunk) {
> > +                chunk->prev_chunk = PA(lastchunk, m_SurfaceMemSlot);
> > +                lastchunk->next_chunk = PA(chunk, m_SurfaceMemSlot);
> > +                lastchunk = chunk;
> > +                ++n;
> > +            } else {
> > +                bFail = TRUE;
> > +            }
> > +        }
> > +        delete[] reinterpret_cast<BYTE*>(pdc);
> > +    }
> > +    if (bFail) {
> > +        ReleaseOutput(drawable->release_info.id);
>
> This will leak memory if is not the last chunk and some allocations
> will fail (in stop flow).
>

I do not see any leak here. We leave the loop after we free all the
allocations from OS.
If there is no last chunk, the list contains just one entry, there is
entire bitmap allocated from OS (see ASSERT) and if some HW chunks were
allocated from device memory during unsuccessful AttachNewBitmap, they will
be freed by ReleaseOutput.


>
>
> > +        drawable = NULL;
> > +    }
> > +    return n;
> > +}
> > +
> >  void QxlDevice::PresentThreadRoutine()
> >  {
> >      PAGED_CODE();
> >      int wait;
> >      int notify;
> > +    ULONG delayed = 0;
> >      QXLDrawable** drawables;
> >
> >      DbgPrint(TRACE_LEVEL_INFORMATION, ("--->%s\n", __FUNCTION__));
> > @@ -5206,13 +5288,23 @@ void QxlDevice::PresentThreadRoutine()
> >
> >          if (drawables) {
> >              for (UINT i = 0; drawables[i]; ++i)
> > -                PushDrawable(drawables[i]);
> > +            {
> > +                ULONG n = PrepareDrawable(drawables[i]);
> > +                // only reason why drawables[i] is zeroed is stop flow
> > +                if (drawables[i]) {
> > +                    delayed += n;
> > +                    PushDrawable(drawables[i]);
> > +                }
> > +            }
> >              delete[] drawables;
> > -        }
> > -        else {
> > +        } else {
> >              DbgPrint(TRACE_LEVEL_WARNING, ("%s is being terminated\n",
> >              __FUNCTION__));
> >              break;
> >          }
> > +        if (delayed) {
> > +            DbgPrint(TRACE_LEVEL_WARNING, ("%s: %d delayed chunks\n",
> > __FUNCTION__, delayed));
> > +            delayed = 0;
>
> If you move the variable declaration/initialization inside the for
> this is not needed.
>

This would create more printouts in case the array contains several/many
drawables.


>
> > +        }
> >      }
> >  }
> >
> > diff --git a/qxldod/QxlDod.h b/qxldod/QxlDod.h
> > index 28373b8..c35ec3e 100755
> > --- a/qxldod/QxlDod.h
> > +++ b/qxldod/QxlDod.h
> > @@ -474,6 +474,7 @@ ReleaseMutex(
> >  #define MAX_OUTPUT_RES 6
> >
> >  typedef struct QXLOutput {
> > +    LIST_ENTRY list;
> >      UINT32 num_res;
> >  #ifdef DBG
> >      UINT32 type;
> > @@ -612,6 +613,8 @@ private:
> >      BOOLEAN PutBytesAlign(QXLDataChunk **chunk_ptr, UINT8 **now_ptr,
> >                              UINT8 **end_ptr, UINT8 *src, int size,
> >                              size_t alloc_size, PLIST_ENTRY pDelayed);
> > +    QXLDataChunk *MakeChunk(QXL_DELAYED_CHUNK *pdc);
> > +    ULONG PrepareDrawable(QXLDrawable*& drawable);
> >      void AsyncIo(UCHAR  Port, UCHAR Value);
> >      void SyncIo(UCHAR  Port, UCHAR Value);
> >      NTSTATUS UpdateChildStatus(BOOLEAN connect);
>
> Frediano
>
> On Mon, Apr 10, 2017 at 3:45 PM, Frediano Ziglio < fziglio@redhat.com >
> wrote:

> > >
> 
> > > In case of VSync is active (for the driver this means it shall take
> 
> > > in account watchdog policy and ensure fast execution of
> > > PresentDisplayOnly
> 
> > > callback) allocate bitmaps for drawable objects using non-forced
> > > requests.
> 
> > > If immediate allocation is not possible, place entire bitmap into memory
> 
> > > chunk allocated from the OS.
> 
> > > If bitmap is allocated from device memory, but one of later
> 
> > > chunks can't be allocated, allocate this and further chunks from
> 
> > > OS memory. All these 'delayed' allocations placed into linked list
> 
> > > which root entry is part of QXLOutput structure.
> 
> > > From separate thread, before sending drawable objects down, review
> 
> > > the list of delayed chunks and allocate device memory (forced) to
> 
> > > all of them.
> 
> > > The cost of solution is 2 pointers added to each drawable or cursor
> 
> > > object.
> 
> > > Cursor commands currently do not use them; in future we have an option
> 
> > > to offload also cursor commands.
> 
> > >
> 
> > > Signed-off-by: Yuri Benditovich < yuri.benditovich@daynix.com >
> 
> > > ---
> 
> > > qxldod/QxlDod.cpp | 108
> 
> > > ++++++++++++++++++++++++++++++++++++++++++++++++++----
> 
> > > qxldod/QxlDod.h | 3 ++
> 
> > > 2 files changed, 103 insertions(+), 8 deletions(-)
> 
> > >
> 
> > > diff --git a/qxldod/QxlDod.cpp b/qxldod/QxlDod.cpp
> 
> > > index c832c93..7ff903a 100755
> 
> > > --- a/qxldod/QxlDod.cpp
> 
> > > +++ b/qxldod/QxlDod.cpp
> 
> > > @@ -4224,6 +4224,13 @@ void QxlDevice::DrawableAddRes(QXLDrawable
> > > *drawable,
> 
> > > Resource *res)
> 
> > > AddRes(output, res);
> 
> > > }
> 
> > >
> 
> > > +static FORCEINLINE PLIST_ENTRY DelayedList(QXLDrawable *pd)
> 
> > > +{
> 
> > > + QXLOutput *output;
> 
> > > + output = (QXLOutput *)((UINT8 *)pd - sizeof(QXLOutput));
> 
> > > + return &output->list;
> 
> > > +}
> 
> > > +
> 

> > This hacky code should just not exist. This assumes that before
> 
> > every Drawable there is a QXLOutput.
> 

> This is correct assumption, see creation of drawable.
> DrawableAddRes does exactly the same

> > I know there are already similar code but would be better to
> 
> > use a structure that contains a QXLOutput and a QXLDrawable and
> 
> > use that.
> 

> This would create more changes in the code than really needed.

I agree for this series but would like to have a proper style document for this driver 
in a near future. 

> > > void QxlDevice::CursorCmdAddRes(QXLCursorCmd *cmd, Resource *res)
> 
> > > {
> 
> > > PAGED_CODE();
> 
> > > @@ -4331,6 +4338,7 @@ QXLDrawable *QxlDevice::Drawable(UINT8 type, CONST
> > > RECT
> 
> > > *area, CONST RECT *clip,
> 
> > > drawable->surfaces_dest[1] = - 1;
> 
> > > drawable->surfaces_dest[2] = -1;
> 
> > > CopyRect(&drawable->bbox, area);
> 
> > > + InitializeListHead(DelayedList(drawable));
> 
> > >
> 

> > Use a constructor?
> 

> I did not plan to rework all the existing code.
> The rule of thumb is as less changes as possible.

Well... this is new code but on the other way calling the constructor at the moment 
could take some more code so agreed for this series. 

> > > if (!SetClip(clip, drawable)) {
> 
> > > DbgPrint(TRACE_LEVEL_VERBOSE, ("%s: set clip failed\n",
> 
> > > __FUNCTION__));
> 
> > > @@ -4425,7 +4433,7 @@ BOOLEAN QxlDevice::AttachNewBitmap(QXLDrawable
> 
> > > *drawable, UINT8 *src, UINT8 *src
> 
> > > Resource *image_res;
> 
> > > InternalImage *internal;
> 
> > > QXLDataChunk *chunk;
> 
> > > - PLIST_ENTRY pDelayedList = NULL;
> 
> > > + PLIST_ENTRY pDelayedList = bForce ? NULL : DelayedList(drawable);
> 
> > > UINT8* dest, *dest_end;
> 
> > >
> 
> > > height = drawable->u.copy.src_area.bottom;
> 
> > > @@ -4494,9 +4502,12 @@ BOOLEAN QxlDevice::AttachNewBitmap(QXLDrawable
> 
> > > *drawable, UINT8 *src, UINT8 *src
> 
> > > }
> 
> > >
> 
> > > for (; src != src_end; src -= pitch, alloc_size -= line_size) {
> 
> > > - BOOLEAN b = PutBytesAlign(&chunk, &dest, &dest_end, src, line_size,
> 
> > > alloc_size, pDelayedList);
> 
> > > - if (!b) {
> 
> > > - DbgPrint(TRACE_LEVEL_WARNING, ("%s: aborting copy of lines\n",
> 
> > > __FUNCTION__));
> 
> > > + if (!PutBytesAlign(&chunk, &dest, &dest_end, src, line_size,
> 
> > > alloc_size, pDelayedList)) {
> 
> > > + if (pitch < 0 && bForce) {
> 
> > > + DbgPrint(TRACE_LEVEL_WARNING, ("%s: aborting copy of lines
> 
> > > (forced)\n", __FUNCTION__));
> 
> > > + } else {
> 
> > > + DbgPrint(TRACE_LEVEL_WARNING, ("%s: unexpected aborting copy
> 
> > > of lines (force %d, pitch %d)\n", __FUNCTION__, bForce, pitch));
> 
> > > + }
> 

> > Why the if on the pitch and not just this last version?
> 

> > > return FALSE;
> 
> > > }
> 
> > > }
> 
> > > @@ -4551,7 +4562,13 @@ QXLDrawable *QxlDevice::PrepareBltBits (
> 
> > > UINT8* src_end = src - pSrc->Pitch;
> 
> > > src += pSrc->Pitch * (height - 1);
> 
> > >
> 
> > > - if (!AttachNewBitmap(drawable, src, src_end, (INT)pSrc->Pitch, TRUE)) {
> 
> > > + if (!AttachNewBitmap(drawable, src, src_end, (INT)pSrc->Pitch,
> 
> > > !g_bSupportVSync)) {
> 
> > > + PLIST_ENTRY pDelayedList = DelayedList(drawable);
> 
> > > + // if some delayed chunks were allocated, free them
> 
> > > + while (!IsListEmpty(pDelayedList)) {
> 
> > > + QXL_DELAYED_CHUNK *pdc = (QXL_DELAYED_CHUNK
> 
> > > *)RemoveHeadList(pDelayedList);
> 
> > > + delete[] reinterpret_cast<BYTE*>(pdc);
> 
> > > + }
> 

> > destructor?
> 

> Lifecycle of drawable is in general larger than lifecycle of delayed chunks.
> In main flow we need to free chunks allocated from OS earlier than we
> deallocate the drawable.

> > > ReleaseOutput(drawable-> release_info.id );
> 

> > Why ReleaseOutput is not releasing the linked list, which is
> 
> > in it?
> 

> The driver in any flow must free each memory allocation from OS (this is
> checked by the driver verifier).
> Objects allocated from device's memory are released when the ring is being
> processed, i.e. much later.
> In corner cases we do not release drawable objects at all. So we need to free
> the memory allocated from OS before we send the drawable object down.

I would add this to the style comments above. 

> > > drawable = NULL;
> 
> > > } else {
> 
> > > @@ -5179,11 +5196,76 @@ void QxlDevice::StopPresentThread()
> 
> > > }
> 
> > > }
> 
> > >
> 
> > > +QXLDataChunk *QxlDevice::MakeChunk(QXL_DELAYED_CHUNK *pdc)
> 
> > > +{
> 
> > > + PAGED_CODE();
> 
> > > + QXLDataChunk *chunk = (QXLDataChunk *)AllocMem(MSPACE_TYPE_VRAM,
> 
> > > pdc->chunk.data_size + sizeof(QXLDataChunk), TRUE);
> 
> > > + if (chunk)
> 
> > > + {
> 
> > > + chunk->data_size = pdc->chunk.data_size;
> 
> > > + chunk->next_chunk = 0;
> 
> > > + RtlCopyMemory(chunk->data, pdc->chunk.data, chunk->data_size);
> 
> > > + }
> 
> > > + return chunk;
> 
> > > +}
> 
> > > +
> 
> > > +ULONG QxlDevice::PrepareDrawable(QXLDrawable*& drawable)
> 
> > > +{
> 
> > > + PAGED_CODE();
> 
> > > + ULONG n = 0;
> 
> > > + BOOLEAN bFail;
> 
> > > + PLIST_ENTRY pe = DelayedList(drawable);
> 
> > > + QXLDataChunk *chunk, *lastchunk = NULL;
> 
> > > +
> 
> > > + bFail = !m_bActive;
> 
> > > +
> 
> > > + while (!IsListEmpty(pe)) {
> 
> > > + QXL_DELAYED_CHUNK *pdc = (QXL_DELAYED_CHUNK *)RemoveHeadList(pe);
> 
> > > + if (!lastchunk) {
> 
> > > + lastchunk = (QXLDataChunk *)pdc->chunk.prev_chunk;
> 
> > > + }
> 
> > > + if (!bFail && !lastchunk) {
> 
> > > + // bitmap was not allocated, this is single delayed chunk
> 
> > > + QXL_ASSERT(IsListEmpty(pe));
> 
> > > +
> 
> > > + if (AttachNewBitmap(
> 
> > > + drawable,
> 
> > > + pdc->chunk.data,
> 
> > > + pdc->chunk.data + pdc->chunk.data_size,
> 
> > > + -(drawable->u.copy.src_area.right * 4),
> 
> > > + TRUE)) {
> 
> > > + ++n;
> 
> > > + } else {
> 
> > > + bFail = TRUE;
> 
> > > + }
> 
> > > + }
> 
> > > + if (!bFail && lastchunk) {
> 
> > > + // some chunks were not allocated
> 
> > > + chunk = MakeChunk(pdc);
> 
> > > + if (chunk) {
> 
> > > + chunk->prev_chunk = PA(lastchunk, m_SurfaceMemSlot);
> 
> > > + lastchunk->next_chunk = PA(chunk, m_SurfaceMemSlot);
> 
> > > + lastchunk = chunk;
> 
> > > + ++n;
> 
> > > + } else {
> 
> > > + bFail = TRUE;
> 
> > > + }
> 
> > > + }
> 
> > > + delete[] reinterpret_cast<BYTE*>(pdc);
> 
> > > + }
> 
> > > + if (bFail) {
> 
> > > + ReleaseOutput(drawable-> release_info.id );
> 

> > This will leak memory if is not the last chunk and some allocations
> 
> > will fail (in stop flow).
> 

> I do not see any leak here. We leave the loop after we free all the
> allocations from OS.
> If there is no last chunk, the list contains just one entry, there is entire
> bitmap allocated from OS (see ASSERT) and if some HW chunks were allocated
> from device memory during unsuccessful AttachNewBitmap, they will be freed
> by ReleaseOutput.

Yes, you are right. 
I was actually thinking about a break or exit condition. 

> > > + drawable = NULL;
> 
> > > + }
> 
> > > + return n;
> 
> > > +}
> 
> > > +
> 
> > > void QxlDevice::PresentThreadRoutine()
> 
> > > {
> 
> > > PAGED_CODE();
> 
> > > int wait;
> 
> > > int notify;
> 
> > > + ULONG delayed = 0;
> 
> > > QXLDrawable** drawables;
> 
> > >
> 
> > > DbgPrint(TRACE_LEVEL_INFORMATION, ("--->%s\n", __FUNCTION__));
> 
> > > @@ -5206,13 +5288,23 @@ void QxlDevice::PresentThreadRoutine()
> 
> > >
> 
> > > if (drawables) {
> 
> > > for (UINT i = 0; drawables[i]; ++i)
> 
> > > - PushDrawable(drawables[i]);
> 
> > > + {
> 
> > > + ULONG n = PrepareDrawable(drawables[i]);
> 
> > > + // only reason why drawables[i] is zeroed is stop flow
> 
> > > + if (drawables[i]) {
> 
> > > + delayed += n;
> 
> > > + PushDrawable(drawables[i]);
> 
> > > + }
> 
> > > + }
> 
> > > delete[] drawables;
> 
> > > - }
> 
> > > - else {
> 
> > > + } else {
> 
> > > DbgPrint(TRACE_LEVEL_WARNING, ("%s is being terminated\n",
> 
> > > __FUNCTION__));
> 
> > > break;
> 
> > > }
> 
> > > + if (delayed) {
> 
> > > + DbgPrint(TRACE_LEVEL_WARNING, ("%s: %d delayed chunks\n",
> 
> > > __FUNCTION__, delayed));
> 
> > > + delayed = 0;
> 

> > If you move the variable declaration/initialization inside the for
> 
> > this is not needed.
> 

> This would create more printouts in case the array contains several/many
> drawables.

No. But is just question of style. 

> > > + }
> 
> > > }
> 
> > > }
> 
> > >
> 
> > > diff --git a/qxldod/QxlDod.h b/qxldod/QxlDod.h
> 
> > > index 28373b8..c35ec3e 100755
> 
> > > --- a/qxldod/QxlDod.h
> 
> > > +++ b/qxldod/QxlDod.h
> 
> > > @@ -474,6 +474,7 @@ ReleaseMutex(
> 
> > > #define MAX_OUTPUT_RES 6
> 
> > >
> 
> > > typedef struct QXLOutput {
> 
> > > + LIST_ENTRY list;
> 
> > > UINT32 num_res;
> 
> > > #ifdef DBG
> 
> > > UINT32 type;
> 

Not that I like much other pointers put in device memory but for the moment is not a regression 
and is not causing possible security concern (host had access to guest memory). 

> > > @@ -612,6 +613,8 @@ private:
> 
> > > BOOLEAN PutBytesAlign(QXLDataChunk **chunk_ptr, UINT8 **now_ptr,
> 
> > > UINT8 **end_ptr, UINT8 *src, int size,
> 
> > > size_t alloc_size, PLIST_ENTRY pDelayed);
> 
> > > + QXLDataChunk *MakeChunk(QXL_DELAYED_CHUNK *pdc);
> 
> > > + ULONG PrepareDrawable(QXLDrawable*& drawable);
> 
> > > void AsyncIo(UCHAR Port, UCHAR Value);
> 
> > > void SyncIo(UCHAR Port, UCHAR Value);
> 
> > > NTSTATUS UpdateChildStatus(BOOLEAN connect);
> 

So for the time being I'll accept this, beside the structure name that I would change (QXL_DELAYED_CHUNK). 

Frediano