Floating Pointers on OpenCL cl_mem

OpenCL

Hi, and welcome to my blog in 2018! I hope you all feel as excited about the new year as I do. Sorry for being silent for a while, but I was absorbed in my first GitHub fork, “sowson/darknet”, and I was coding like crazy. I still have several to-dos in this code, but I really like it. Regarding this journey, today I would like to share with you something really awesome that I had been looking for all over the Internet and GitHub. The problem I was trying to solve is, I think, a well-known limitation of OpenCL that CUDA does not have when you use the C language for low-level development. In CUDA you can have a variable_gpu of type float * and perform pointer arithmetic on it. What does that mean? It means that you can use the +, -, += or -= operators to change the address and access a different part of memory.

variable_gpu += 20;

or

variable_pointed_gpu = variable_gpu + 20;

The above examples are easy to achieve with CUDA, but OpenCL uses the cl_mem abstraction, and float * is only available in kernel code; in the host-side C implementation you do not have many choices. Fortunately, I found a very good tip: there is a function named clCreateSubBuffer. I decided to use this function, but first I extended the classic cl_mem as follows in a header file.

/*
 * Extended OpenCL buffer handle: a cl_mem plus the bookkeeping needed to
 * emulate host-side pointer arithmetic through sub-buffers.
 *
 * The typedef is declared once, ahead of the struct body, so the function
 * pointer members can name the type; repeating the typedef on the struct
 * definition is only legal from C11 onwards.
 */
typedef struct _cl_mem_ext cl_mem_ext;

struct _cl_mem_ext {
    cl_mem mem;  /* handle used for transfers; equals org until a sub-buffer is created */
    cl_mem org;  /* buffer returned by clCreateBuffer; parent of every sub-buffer */
    size_t len;  /* number of elements in org */
    size_t off;  /* offset of mem inside org, in elements */
    size_t obs;  /* size of one element, in bytes */
    size_t cnt;  /* counter: +1 in inc(), -1 in dec(), set to 1 in add()/rem() */
    /* Operation table; the opencl_make_* constructors point these at the
     * free functions inc(), dec(), add() and rem(). */
    cl_mem_ext (*inc) (cl_mem_ext dat, int inc, size_t len);
    cl_mem_ext (*dec) (cl_mem_ext dat, int dec, size_t len);
    cl_mem_ext (*add) (cl_mem_ext dat, int add, size_t len);
    cl_mem_ext (*rem) (cl_mem_ext dat, int rem, size_t len);
};

/*
 * Sub-buffer helpers. Each one receives the handle by value and returns the
 * updated handle; the caller's own variable is not modified. A handle whose
 * len is 0 is returned unchanged by all of them. For mov() and upd(), a len
 * of 0 means "from off to the end of the buffer".
 */

/* Advance off by inc elements, bump cnt, then rebuild mem via mov(). */
cl_mem_ext inc(cl_mem_ext buf, int inc, size_t len);
/* Move off back by dec elements, decrement cnt, then rebuild mem via mov(). */
cl_mem_ext dec(cl_mem_ext buf, int dec, size_t len);
/* Re-create buf.mem as a sub-buffer of buf.org starting at buf.off. */
cl_mem_ext mov(cl_mem_ext buf, size_t len);
/* Set off to inc and cnt to 1, then build the handle via upd(). */
cl_mem_ext add(cl_mem_ext buf, int inc, size_t len);
/* Set off to dec and cnt to 1, then build the handle via upd(). */
cl_mem_ext rem(cl_mem_ext buf, int dec, size_t len);
/* Build a new handle whose mem is a sub-buffer of buf.org starting at buf.off. */
cl_mem_ext upd(cl_mem_ext buf, size_t len);

There are many elements in the extension I would like to propose, but I thought about it very carefully and I need all of them. Now I would like to share with you the possible usages of this abstraction, including the creation and release of cl_mem_ext. All you need to do is use this implementation in a .c file and change every place where you have cl_mem to use cl_mem_ext instead. The implementation is as follows.

cl_mem_ext opencl_make_array(float *x, size_t n)
{
    cl_mem_ext buf;

    buf.len = n;
    buf.obs = sizeof(cl_float);
    buf.off = 0;
    buf.cnt = 0;

    buf.org = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE,
                             buf.len * buf.obs, NULL, NULL);

    buf.mem = buf.org;

    buf.inc = inc;
    buf.dec = dec;
    buf.add = add;
    buf.rem = rem;

    if (x != NULL)
        opencl_push_array(buf, x, n);
    else
    {
        float *cptr = (float*) calloc(n * sizeof(float), 1);
        if (cptr != NULL)
            opencl_push_array(buf, cptr, n);
        free(cptr);
    }

    return buf;
}

cl_mem_ext opencl_make_int_array(size_t n)
{
    cl_mem_ext buf;

    buf.len = n;
    buf.obs = sizeof(cl_int);
    buf.off = 0;
    buf.cnt = 0;

    buf.org = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE,
                             buf.len * buf.obs, NULL, NULL);

    buf.mem = buf.org;

    buf.inc = inc;
    buf.dec = dec;
    buf.add = add;
    buf.rem = rem;

    return buf;
}

/*
 * Blocking host-to-device copy into x_gpu.mem, starting at byte 0 of that
 * handle. The number of bytes written is (n - x_gpu.off) * x_gpu.obs.
 * A failure is reported on stdout; nothing is returned to the caller.
 */
void opencl_push_array(cl_mem_ext x_gpu, float *x, size_t n)
{
    const size_t bytes = (n - x_gpu.off) * x_gpu.obs;
    cl_int status = clEnqueueWriteBuffer(opencl_queue, x_gpu.mem, CL_TRUE,
                                         0, bytes, x, 0, NULL, NULL);

    if (status == CL_SUCCESS)
        return;

    printf("Could not push array to device. Error code %d\n", status);
}

/*
 * Blocking device-to-host copy from x_gpu.mem, starting at byte 0 of that
 * handle, into x. The number of bytes read is (n - x_gpu.off) * x_gpu.obs.
 * A failure is reported on stdout; nothing is returned to the caller.
 */
void opencl_pull_array(cl_mem_ext x_gpu, float *x, size_t n)
{
    const size_t bytes = (n - x_gpu.off) * x_gpu.obs;
    cl_int status = clEnqueueReadBuffer(opencl_queue, x_gpu.mem, CL_TRUE,
                                        0, bytes, x, 0, NULL, NULL);

    if (status == CL_SUCCESS)
        return;

    printf("Could not pull array from device. Error code %d\n", status);
}

/*
 * Release the buffer referenced by x_gpu.org.
 *
 * The handle is received by value, so this function cannot reset the
 * caller's copy; the caller must stop using its handle after this call.
 * A handle whose org is NULL is ignored. A failing release is reported on
 * stdout.
 *
 * NOTE(review): a sub-buffer stored in x_gpu.mem (created by mov()/upd())
 * is not released here - confirm who owns those before adding a release.
 */
void opencl_free(cl_mem_ext x_gpu)
{
    if (x_gpu.org == NULL)
        return;

    cl_int clErr = clReleaseMemObject(x_gpu.org);

    if (clErr != CL_SUCCESS)
        printf("Could not free array on device. Error code %d\n", clErr);
}

/*
 * Shift the handle forward by inc elements and rebuild its sub-buffer.
 * Handles with len == 0 are returned untouched. The updated handle is the
 * return value; the caller's copy is not modified.
 */
cl_mem_ext inc(cl_mem_ext buf, int inc, size_t len) {
    if (buf.len != 0) {
        buf.off = buf.off + inc;
        buf.cnt++;
        buf = mov(buf, len);
    }
    return buf;
}

/*
 * Shift the handle back by dec elements and rebuild its sub-buffer.
 * Handles with len == 0 are returned untouched. The updated handle is the
 * return value; the caller's copy is not modified.
 */
cl_mem_ext dec(cl_mem_ext buf, int dec, size_t len) {
    if (buf.len != 0) {
        buf.off = buf.off - dec;
        buf.cnt--;
        buf = mov(buf, len);
    }
    return buf;
}

/*
 * Point buf.mem at a sub-buffer of buf.org that starts buf.off elements in.
 *
 * len  number of elements the sub-buffer covers; 0 means everything from
 *      buf.off to the end of the original buffer.
 *
 * The previous value of buf.mem is overwritten without being released.
 * A clCreateSubBuffer failure is reported on stdout and the handle is
 * returned with whatever clCreateSubBuffer produced in mem.
 *
 * NOTE(review): OpenCL requires the sub-buffer origin to satisfy the
 * device's base address alignment - confirm the offsets used by callers.
 */
cl_mem_ext mov(cl_mem_ext buf, size_t len) {
    if (buf.len == 0) return buf;

    size_t elems = len;
    if (elems == 0)
        elems = buf.len - buf.off;

    cl_buffer_region window;
    window.origin = buf.off * buf.obs;
    window.size = elems * buf.obs;

    cl_int status = 0;
    buf.mem = clCreateSubBuffer(buf.org, CL_MEM_READ_WRITE,
                                CL_BUFFER_CREATE_TYPE_REGION,
                                &window, &status);

    if (status != CL_SUCCESS)
        printf("Could not sub buffer from device. Error code %d\n", status);

    return buf;
}

/*
 * Build a view of buf that starts inc elements into the original buffer.
 * The offset is set to inc (not added to the current one) and cnt is set
 * to 1 before delegating to upd(). Handles with len == 0 are returned
 * untouched.
 */
cl_mem_ext add(cl_mem_ext buf, int inc, size_t len) {
    if (buf.len != 0) {
        buf.cnt = 1;
        buf.off = inc;
        buf = upd(buf, len);
    }
    return buf;
}

/*
 * Build a view of buf that starts dec elements into the original buffer.
 * The offset is set to dec and cnt is set to 1 before delegating to upd().
 * Handles with len == 0 are returned untouched.
 *
 * NOTE(review): the body is the same as add() apart from the parameter
 * name - confirm whether a different offset computation was intended.
 */
cl_mem_ext rem(cl_mem_ext buf, int dec, size_t len) {
    if (buf.len != 0) {
        buf.cnt = 1;
        buf.off = dec;
        buf = upd(buf, len);
    }
    return buf;
}

/*
 * Return a new handle that views buf.org starting buf.off elements in.
 *
 * len  number of elements the sub-buffer covers; 0 means everything from
 *      buf.off to the end of the original buffer.
 *
 * The result carries over every field of buf, including the inc/dec/add/rem
 * function pointers, and differs only in mem, which becomes the new
 * sub-buffer. A clCreateSubBuffer failure is reported on stdout and the
 * handle is returned with whatever clCreateSubBuffer produced in mem.
 *
 * NOTE(review): OpenCL requires the sub-buffer origin to satisfy the
 * device's base address alignment - confirm the offsets used by callers.
 */
cl_mem_ext upd(cl_mem_ext buf, size_t len) {
    if (buf.len == 0) return buf;

    /* Start from a full copy: filling fields one by one left the function
     * pointers uninitialised, so calling ret.add(...) was undefined. */
    cl_mem_ext ret = buf;

    cl_buffer_region region;

    region.origin = ret.off * ret.obs;
    region.size = len != 0 ? len * ret.obs : (ret.len - ret.off) * ret.obs;

    cl_int err = CL_SUCCESS;
    ret.mem = clCreateSubBuffer(
        ret.org, CL_MEM_READ_WRITE,
        CL_BUFFER_CREATE_TYPE_REGION, &region, &err);

    if (err != CL_SUCCESS)
    {
        printf("Could not sub buffer from device. Error code %d\n", err);
    }

    return ret;
}

And now, instead of the CUDA examples above, in OpenCL you can write the following.

variable_gpu.inc(variable_gpu, 20, 20);

or

variable_pointed_gpu = variable_gpu.add(variable_gpu, 20, 20);

Thanks for reading,

p ;).

Leave a Reply

Your email address will not be published. Required fields are marked *

*

This site uses Akismet to reduce spam. Learn how your comment data is processed.