Make in-order command queues actually be in-order

Submitted by Rebecca N. Palmer on July 21, 2018, 7:12 p.m.

Details

Message ID e09c7490-70af-217f-a785-161546772942@zoho.com
State New
Headers show
Series "Make in-order command queues actually be in-order" ( rev: 1 ) in Beignet

Not browsing as part of any series.

Commit Message

Rebecca N. Palmer July 21, 2018, 7:12 p.m.
When beignet added out-of-order execution support (7fd45f15),
it made *all* command queues out-of-order, even if they were
created as (and are reported by clGetCommandQueueInfo as) in-order.

Signed-off-by: Rebecca N. Palmer <rebecca_palmer@zoho.com>
---
Not sure whether this one is actually worth it: it's clearly
against the spec, but I'm not aware of it causing any
real-world bugs.  (I noticed it while investigating
an issue that turned out to be unrelated.)  Users who expect a
queue to be in-order are probably not using events, and that
makes a beignet queue effectively in-order.

(This is *not* true of out-of-order queues in some other ICDs,
e.g. pocl: it is true in Beignet because our flush (in particular
the implicit one before a blocking copy) is also an ordering
barrier, but the spec doesn't require that.  If you choose not to
take this, it might be a good idea to add a comment to
cl_command_queue_wait_flush documenting that.)

Patch hide | download patch | download mbox

--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -283,7 +283,7 @@  clEnqueueSVMFree (cl_command_queue comma
     data->size      = num_svm_pointers;
     data->ptr       = user_data;
 
-    if (e_status == CL_COMPLETE) {
+    if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
       // Sync mode, no need to queue event.
       err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
       if (err != CL_SUCCESS) {
@@ -429,7 +429,7 @@  cl_int clEnqueueSVMMemcpy (cl_command_qu
     data->const_ptr    = src_ptr;
     data->size         = size;
 
-    if (e_status == CL_COMPLETE) {
+    if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
       // Sync mode, no need to queue event.
       err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
       if (err != CL_SUCCESS) {
@@ -441,6 +441,9 @@  cl_int clEnqueueSVMMemcpy (cl_command_qu
         break;
       }
       cl_command_queue_enqueue_event(command_queue, e);
+      if (blocking_copy) {
+        cl_event_wait_for_events_list(1, &e);
+      }
     }
   } while(0);
 
@@ -518,7 +521,7 @@  cl_int clEnqueueSVMMemFill (cl_command_q
     data->pattern_size = pattern_size;
     data->size         = size;
 
-    if (e_status == CL_COMPLETE) {
+    if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
       // Sync mode, no need to queue event.
       err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
       if (err != CL_SUCCESS) {
--- a/src/cl_api_kernel.c
+++ b/src/cl_api_kernel.c
@@ -223,6 +223,7 @@  clEnqueueNDRangeKernel(cl_command_queue
     count *= global_wk_sz_rem[2] ? 2 : 1;
 
     const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem};
+    cl_bool allow_immediate_submit = cl_command_queue_allow_bypass_submit(command_queue);
     /* Go through the at most 8 cases and euque if there is work items left */
     for (i = 0; i < 2; i++) {
       for (j = 0; j < 2; j++) {
@@ -263,7 +264,7 @@  clEnqueueNDRangeKernel(cl_command_queue
             break;
           }
 
-          err = cl_event_exec(e, (event_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED), CL_FALSE);
+          err = cl_event_exec(e, ((allow_immediate_submit && event_status == CL_COMPLETE) ? CL_SUBMITTED : CL_QUEUED), CL_FALSE);
           if (err != CL_SUCCESS) {
             break;
           }
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -309,7 +309,7 @@  clEnqueueMapBuffer(cl_command_queue comm
     if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION))
       data->write_map = 1;
 
-    if (e_status == CL_COMPLETE) {
+    if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
       // Sync mode, no need to queue event.
       err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
       if (err != CL_SUCCESS) {
@@ -322,6 +322,9 @@  clEnqueueMapBuffer(cl_command_queue comm
       }
 
       cl_command_queue_enqueue_event(command_queue, e);
+      if (blocking_map) {
+        cl_event_wait_for_events_list(1, &e);
+      }
     }
 
     ptr = data->ptr;
@@ -469,7 +472,7 @@  clEnqueueUnmapMemObject(cl_command_queue
     data->mem_obj = memobj;
     data->ptr = mapped_ptr;
 
-    if (e_status == CL_COMPLETE) { // No need to wait
+    if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { // No need to wait
       err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
       if (err != CL_SUCCESS) {
         break;
@@ -571,7 +574,7 @@  clEnqueueReadBuffer(cl_command_queue com
     data->offset = offset;
     data->size = size;
 
-    if (e_status == CL_COMPLETE) {
+    if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
       // Sync mode, no need to queue event.
       err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
       if (err != CL_SUCCESS) {
@@ -583,6 +586,9 @@  clEnqueueReadBuffer(cl_command_queue com
         break;
       }
       cl_command_queue_enqueue_event(command_queue, e);
+      if (blocking_read) {
+        cl_event_wait_for_events_list(1, &e);
+      }
     }
   } while (0);
 
@@ -674,7 +680,7 @@  clEnqueueWriteBuffer(cl_command_queue co
     data->offset = offset;
     data->size = size;
 
-    if (e_status == CL_COMPLETE) {
+    if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
       // Sync mode, no need to queue event.
       err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
       if (err != CL_SUCCESS) {
@@ -686,6 +692,9 @@  clEnqueueWriteBuffer(cl_command_queue co
         break;
       }
       cl_command_queue_enqueue_event(command_queue, e);
+      if (blocking_write) {
+        cl_event_wait_for_events_list(1, &e);
+      }
     }
   } while (0);
 
@@ -823,7 +832,7 @@  clEnqueueReadBufferRect(cl_command_queue
     data->host_row_pitch = host_row_pitch;
     data->host_slice_pitch = host_slice_pitch;
 
-    if (e_status == CL_COMPLETE) {
+    if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
       // Sync mode, no need to queue event.
       err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
       if (err != CL_SUCCESS) {
@@ -835,6 +844,9 @@  clEnqueueReadBufferRect(cl_command_queue
         break;
       }
       cl_command_queue_enqueue_event(command_queue, e);
+      if (blocking_read) {
+        cl_event_wait_for_events_list(1, &e);
+      }
     }
   } while (0);
 
@@ -974,7 +986,7 @@  clEnqueueWriteBufferRect(cl_command_queu
     data->host_row_pitch = host_row_pitch;
     data->host_slice_pitch = host_slice_pitch;
 
-    if (e_status == CL_COMPLETE) {
+    if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
       // Sync mode, no need to queue event.
       err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
       if (err != CL_SUCCESS) {
@@ -986,6 +998,9 @@  clEnqueueWriteBufferRect(cl_command_queu
         break;
       }
       cl_command_queue_enqueue_event(command_queue, e);
+      if (blocking_write) {
+        cl_event_wait_for_events_list(1, &e);
+      }
     }
   } while (0);
 
@@ -1093,7 +1108,7 @@  clEnqueueCopyBuffer(cl_command_queue com
       break;
     }
 
-    err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+    err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
     if (err != CL_SUCCESS) {
       break;
     }
@@ -1283,7 +1298,7 @@  clEnqueueCopyBufferRect(cl_command_queue
     if (e_status < CL_COMPLETE) { // Error happend, cancel.
       err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
       break;
-    } else if (e_status == CL_COMPLETE) {
+    } else if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
       err = cl_event_exec(e, CL_SUBMITTED, CL_FALSE);
       if (err != CL_SUCCESS) {
         break;
@@ -1384,7 +1399,7 @@  clEnqueueFillBuffer(cl_command_queue com
       break;
     }
 
-    err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+    err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
     if (err != CL_SUCCESS) {
       break;
     }
@@ -1471,7 +1486,7 @@  clEnqueueMigrateMemObjects(cl_command_qu
       break;
     }
 
-    err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+    err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
     if (err != CL_SUCCESS) {
       break;
     }
@@ -1764,7 +1779,7 @@  clEnqueueMapImage(cl_command_queue comma
     if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION))
       data->write_map = 1;
 
-    if (e_status == CL_COMPLETE) {
+    if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
       // Sync mode, no need to queue event.
       err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
       if (err != CL_SUCCESS) {
@@ -1777,6 +1792,9 @@  clEnqueueMapImage(cl_command_queue comma
       }
 
       cl_command_queue_enqueue_event(command_queue, e);
+      if (blocking_map) {
+        cl_event_wait_for_events_list(1, &e);
+      }
     }
 
     ptr = data->ptr;
@@ -2014,7 +2032,7 @@  clEnqueueReadImage(cl_command_queue comm
     data->row_pitch = row_pitch;
     data->slice_pitch = slice_pitch;
 
-    if (e_status == CL_COMPLETE) {
+    if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
       // Sync mode, no need to queue event.
       err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
       if (err != CL_SUCCESS) {
@@ -2026,6 +2044,9 @@  clEnqueueReadImage(cl_command_queue comm
         break;
       }
       cl_command_queue_enqueue_event(command_queue, e);
+      if (blocking_read) {
+        cl_event_wait_for_events_list(1, &e);
+      }
     }
   } while (0);
 
@@ -2218,7 +2239,7 @@  clEnqueueWriteImage(cl_command_queue com
     data->row_pitch = row_pitch;
     data->slice_pitch = slice_pitch;
 
-    if (e_status == CL_COMPLETE) {
+    if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
       // Sync mode, no need to queue event.
       err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
       if (err != CL_SUCCESS) {
@@ -2230,6 +2251,9 @@  clEnqueueWriteImage(cl_command_queue com
         break;
       }
       cl_command_queue_enqueue_event(command_queue, e);
+      if (blocking_write) {
+        cl_event_wait_for_events_list(1, &e);
+      }
     }
   } while (0);
 
@@ -2364,7 +2388,7 @@  clEnqueueCopyImage(cl_command_queue comm
       break;
     }
 
-    err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+    err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
     if (err != CL_SUCCESS) {
       break;
     }
@@ -2475,7 +2499,7 @@  clEnqueueCopyImageToBuffer(cl_command_qu
       break;
     }
 
-    err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+    err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
     if (err != CL_SUCCESS) {
       break;
     }
@@ -2587,7 +2611,7 @@  clEnqueueCopyBufferToImage(cl_command_qu
       break;
     }
 
-    err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+    err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
     if (err != CL_SUCCESS) {
       break;
     }
@@ -2697,7 +2721,7 @@  clEnqueueFillImage(cl_command_queue comm
       break;
     }
 
-    err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
+    err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
     if (err != CL_SUCCESS) {
       break;
     }
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -103,6 +103,11 @@  extern cl_int cl_command_queue_wait_fini
 extern cl_int cl_command_queue_wait_flush(cl_command_queue queue);
 /* Note: Must call this function with queue's lock. */
 extern cl_event *cl_command_queue_record_in_queue_events(cl_command_queue queue, cl_uint *list_num);
+/* Whether it is valid to call cl_event_exec directly, instead of cl_command_queue_enqueue_event */
+static inline cl_bool cl_command_queue_allow_bypass_submit(cl_command_queue queue){
+  return (queue->props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)/* if out-of-order, always */
+    || list_empty(&queue->worker.enqueued_events);/* if in-order, only if empty */
+}
 
 #endif /* __CL_COMMAND_QUEUE_H__ */
 
--- a/src/cl_command_queue_enqueue.c
+++ b/src/cl_command_queue_enqueue.c
@@ -65,6 +65,8 @@  worker_thread_function(void *Arg)
       if (cl_event_is_ready(e) <= CL_COMPLETE) {
         list_node_del(&e->enqueue_node);
         list_add_tail(&ready_list, &e->enqueue_node);
+      } else if(!(queue->props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)){
+        break; /* in in-order mode, can't skip over non-ready events */
       }
     }
 
@@ -80,18 +82,20 @@  worker_thread_function(void *Arg)
     CL_OBJECT_UNLOCK(queue);
 
     /* Do the really job without lock.*/
-    exec_status = CL_SUBMITTED;
-    list_for_each_safe(pos, n, &ready_list)
-    {
-      e = list_entry(pos, _cl_event, enqueue_node);
-      cl_event_exec(e, exec_status, CL_FALSE);
-    }
+    if (queue->props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) { /* in in-order mode, need to get each all the way to CL_COMPLETE before starting the next one */
+      exec_status = CL_SUBMITTED;
+      list_for_each_safe(pos, n, &ready_list)
+      {
+        e = list_entry(pos, _cl_event, enqueue_node);
+        cl_event_exec(e, exec_status, CL_FALSE);
+      }
 
-    /* Notify all waiting for flush. */
-    CL_OBJECT_LOCK(queue);
-    worker->in_exec_status = CL_SUBMITTED;
-    CL_OBJECT_NOTIFY_COND(queue);
-    CL_OBJECT_UNLOCK(queue);
+      /* Notify all waiting for flush. */
+      CL_OBJECT_LOCK(queue);
+      worker->in_exec_status = CL_SUBMITTED;
+      CL_OBJECT_NOTIFY_COND(queue);
+      CL_OBJECT_UNLOCK(queue);
+    }
 
     list_for_each_safe(pos, n, &ready_list)
     {
--- a/src/cl_gl_api.c
+++ b/src/cl_gl_api.c
@@ -188,7 +188,7 @@  cl_int clEnqueueAcquireGLObjects (cl_com
     data = &e->exec_data;
     data->type = EnqueueReturnSuccesss;
 
-    if (e_status == CL_COMPLETE) {
+    if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
       // Sync mode, no need to queue event.
       err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
       if (err != CL_SUCCESS) {
@@ -274,7 +274,7 @@  cl_int clEnqueueReleaseGLObjects (cl_com
     data = &e->exec_data;
     data->type = EnqueueReturnSuccesss;
 
-    if (e_status == CL_COMPLETE) {
+    if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
       // Sync mode, no need to queue event.
       err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
       if (err != CL_SUCCESS) {

Comments

A demonstration that "in-order" queues currently aren't:

//g++ -o queue_order_test queue_order_test.c -lOpenCL
//Depends: beignet-opencl-icd ocl-icd-opencl-dev
#include <CL/cl.h>
#include <stdio.h>
int main()
{
  cl_int status;
  cl_device_id device;
clGetDeviceIDs(NULL,CL_DEVICE_TYPE_ALL,1,&device,NULL);
char device_name[101];
device_name[100]=0;
clGetDeviceInfo(device,CL_DEVICE_NAME,100,device_name,NULL);
printf("Using device %s",device_name);
cl_context ctx;
  cl_command_queue queue;
  cl_program program1,program2;
  cl_kernel kernel1,kernel2;
  cl_mem buffer;
  cl_event uevent1,uevent2,kernels_finished[2];
  size_t n = 3;
  cl_int test_data[3] = {3, 7, 5};
  const char* kernel1_source = "__kernel void test1(__global int *buf) {"
  "printf(\"kern1 \");"
  "  buf[get_global_id(0)] = 2* buf[get_global_id(0)];"
  "}";
  const char* kernel2_source = "__kernel void test2(__global int *buf) {"
  "printf(\"kern2 \");"
  "  buf[get_global_id(0)] = 9+ buf[get_global_id(0)];"
  "}";
  //Expected result: 15 23 19 if 1 runs first (in-order queue), 24 32 28 if 2 runs first (out-of-order queue)
  ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
  if(!ctx)
    return 1;

//cl_queue_properties qsettings[3]={CL_QUEUE_PROPERTIES,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,0};
cl_queue_properties qsettings[3]={CL_QUEUE_PROPERTIES,0,0};
queue = clCreateCommandQueueWithProperties(ctx, device, qsettings, &status);
//queue = clCreateCommandQueueWithProperties(ctx, device, 0, &status);
cl_command_queue_properties qp;
clGetCommandQueueInfo(queue,CL_QUEUE_PROPERTIES,sizeof(qp),&qp,NULL);
printf(" queue properties %i\n",qp);
program1 = clCreateProgramWithSource(ctx, 1, &kernel1_source, NULL, &status);
clBuildProgram(program1, 1, &device, "", NULL, NULL);
kernel1 = clCreateKernel(program1, "test1", &status);
program2 = clCreateProgramWithSource(ctx, 1, &kernel2_source, NULL, &status);
clBuildProgram(program2, 1, &device, "", NULL, NULL);
kernel2 = clCreateKernel(program2, "test2", &status);
buffer = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR, n*4, test_data, &status);
uevent1=clCreateUserEvent(ctx,&status);
uevent2=clCreateUserEvent(ctx,&status);
clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buffer);
clSetKernelArg(kernel2, 0, sizeof(cl_mem), &buffer);
clEnqueueNDRangeKernel(queue, kernel1, 1, NULL, &n, &n, 1,&uevent1, &kernels_finished[0]);
clEnqueueNDRangeKernel(queue, kernel2, 1, NULL, &n, &n, 0,NULL, &kernels_finished[1]);//without uevent2, bypasses queue
//clEnqueueNDRangeKernel(queue, kernel2, 1, NULL, &n, &n, 1,&uevent2, &kernels_finished[1]);
clSetUserEventStatus(uevent2,CL_COMPLETE);
printf("\nsetting event %p (others %p %p) - enter a number\n",uevent1,kernels_finished[0],kernels_finished[1]);
int j;scanf("%i",&j);
clSetUserEventStatus(uevent1,CL_COMPLETE);
clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, n*4, test_data, 2, kernels_finished, NULL);
printf("\nresult: %i %i %i\n",test_data[0],test_data[1],test_data[2]);
}