/* adding two vectors */

    #include <stdio.h>
    #include <strings.h>
    #include <stdcl.h>

    #define SIZE 1024

    /*
     * Runs the "addvec_kern" kernel from add_vec.cl over two SIZE-element
     * float vectors and prints aa[i], bb[i] and cc[i] for every index.
     * (The kernel itself is not in this file; presumably it stores
     * aa[i] + bb[i] into cc[i].)
     *
     * Returns 0 on success, 1 if no context, kernel file, kernel symbol or
     * buffer could be obtained; a diagnostic is written to stderr in that case.
     */
    int main(void)
    {
        int i;
        int rc = 1;

        void* clh = NULL;
        cl_kernel k_addvec = NULL;

        float* aa = NULL;
        float* bb = NULL;
        float* cc = NULL;

        /* 1-D index range: offset 0, SIZE work items, 64 per work group.
           Declared up front so no goto below jumps past its initialisation. */
        clndrange_t ndr = clndrange_init1d(0,SIZE,64);

        /* Use the GPU context when one exists, otherwise the CPU context. */
        CONTEXT* cp = (stdgpu)? stdgpu : stdcpu;

        if (!cp) {
            fprintf(stderr,"addvec: no GPU or CPU context available\n");
            return 1;
        }

        clh = clopen(cp, "add_vec.cl",CLLD_NOW);
        if (!clh) {
            fprintf(stderr,"addvec: cannot open add_vec.cl\n");
            return 1;
        }

        k_addvec = clsym(cp, clh, "addvec_kern", CLLD_NOW);
        if (!k_addvec) {
            fprintf(stderr,"addvec: kernel addvec_kern not found\n");
            goto done;
        }

        aa = (float*)clmalloc(cp,SIZE*sizeof(float),0);
        bb = (float*)clmalloc(cp,SIZE*sizeof(float),0);
        cc = (float*)clmalloc(cp,SIZE*sizeof(float),0);
        if (!aa || !bb || !cc) {
            fprintf(stderr,"addvec: clmalloc failed\n");
            goto done;
        }

        /* Fill the inputs and clear the result vector in one pass. */
        for(i=0;i<SIZE;i++) {
            aa[i] = 111.0f * i;
            bb[i] = 222.0f * i;
            cc[i] = 0.0f;
        }

        /* Queue the input transfers to device 0 without blocking. */
        clmsync(cp,0,aa,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
        clmsync(cp,0,bb,CL_MEM_DEVICE|CL_EVENT_NOWAIT);

        clarg_set_global(cp,k_addvec,0,aa);
        clarg_set_global(cp,k_addvec,1,bb);
        clarg_set_global(cp,k_addvec,2,cc);

        /* Queue the kernel, then the transfer of the result back to the host. */
        clfork(cp,0,k_addvec,&ndr,CL_EVENT_NOWAIT);

        clmsync(cp,0,cc,CL_MEM_HOST|CL_EVENT_NOWAIT);

        /* Block until the queued memory and kernel events on device 0 finish. */
        clwait(cp,0,CL_MEM_EVENT|CL_KERNEL_EVENT|CL_EVENT_RELEASE);

        for(i=0;i<SIZE;i++) printf("%f %f %f\n",aa[i],bb[i],cc[i]);

        rc = 0;

    done:
        /* Reached on success and on partial failure, so each buffer may be NULL. */
        if (aa) clfree(aa);
        if (bb) clfree(bb);
        if (cc) clfree(cc);

        clclose(cp,clh);

        return rc;
    }