clray
view src/ocl.cc @ 32:4cf4919c3812
performance sucks
author | John Tsiombikas <nuclear@member.fsf.org> |
---|---|
date | Tue, 24 Aug 2010 05:43:57 +0100 |
parents | 97cfd9675310 |
children | 980bc07be868 |
line source
1 #define OCL_CC_
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
6 #include <stdarg.h>
7 #include <errno.h>
8 #ifndef _MSC_VER
9 #include <alloca.h>
10 #else
11 #include <malloc.h>
12 #endif
13 #include <sys/stat.h>
14 #include "ocl.h"
15 #include "ocl_errstr.h"
/* Type with nothing but a constructor. The file-scope instance of it defined
 * below makes that constructor (and therefore init_opencl) run during static
 * initialization.
 */
struct InitCL {
	InitCL();
};
23 struct device_info {
24 cl_device_id id;
25 cl_device_type type;
26 unsigned int units;
27 unsigned int clock;
29 unsigned int dim;
30 size_t *work_item_sizes;
31 size_t work_group_size;
33 unsigned long mem_size;
34 };
36 static bool init_opencl(void);
37 static int select_device(struct device_info *di, int (*devcmp)(struct device_info*, struct device_info*));
38 static int get_dev_info(cl_device_id dev, struct device_info *di);
39 static int devcmp(struct device_info *a, struct device_info *b);
40 static const char *devtypestr(cl_device_type type);
41 static void print_memsize(FILE *out, unsigned long memsz);
42 static const char *clstrerror(int err);
45 static InitCL initcl;
46 static cl_context ctx;
47 static cl_command_queue cmdq;
48 static device_info devinf;
50 InitCL::InitCL()
51 {
52 if(!init_opencl()) {
53 exit(0);
54 }
55 }
57 static bool init_opencl(void)
58 {
59 if(select_device(&devinf, devcmp) == -1) {
60 return false;
61 }
64 if(!(ctx = clCreateContext(0, 1, &devinf.id, 0, 0, 0))) {
65 fprintf(stderr, "failed to create opencl context\n");
66 return false;
67 }
69 if(!(cmdq = clCreateCommandQueue(ctx, devinf.id, 0, 0))) {
70 fprintf(stderr, "failed to create command queue\n");
71 return false;
72 }
73 return true;
74 }
77 CLMemBuffer *create_mem_buffer(int rdwr, size_t sz, const void *buf)
78 {
79 int err;
80 cl_mem mem;
81 cl_mem_flags flags = rdwr | CL_MEM_ALLOC_HOST_PTR;
83 if(buf) {
84 flags |= CL_MEM_COPY_HOST_PTR;
85 }
88 if(!(mem = clCreateBuffer(ctx, flags, sz, (void*)buf, &err))) {
89 fprintf(stderr, "failed to create memory buffer: %s\n", clstrerror(err));
90 return 0;
91 }
93 CLMemBuffer *mbuf = new CLMemBuffer;
94 mbuf->mem = mem;
95 mbuf->size = sz;
96 mbuf->ptr = 0;
97 return mbuf;
98 }
100 void destroy_mem_buffer(CLMemBuffer *mbuf)
101 {
102 if(mbuf) {
103 clReleaseMemObject(mbuf->mem);
104 delete mbuf;
105 }
106 }
108 void *map_mem_buffer(CLMemBuffer *mbuf, int rdwr)
109 {
110 if(!mbuf) return 0;
112 #ifndef NDEBUG
113 if(mbuf->ptr) {
114 fprintf(stderr, "WARNING: map_mem_buffer called on already mapped buffer\n");
115 }
116 #endif
118 int err;
119 mbuf->ptr = clEnqueueMapBuffer(cmdq, mbuf->mem, 1, rdwr, 0, mbuf->size, 0, 0, 0, &err);
120 if(!mbuf->ptr) {
121 fprintf(stderr, "failed to map buffer: %s\n", clstrerror(err));
122 return 0;
123 }
124 return mbuf->ptr;
125 }
127 void unmap_mem_buffer(CLMemBuffer *mbuf)
128 {
129 if(!mbuf || !mbuf->ptr) return;
130 clEnqueueUnmapMemObject(cmdq, mbuf->mem, mbuf->ptr, 0, 0, 0);
131 mbuf->ptr = 0;
132 }
134 bool write_mem_buffer(CLMemBuffer *mbuf, size_t sz, const void *src)
135 {
136 if(!mbuf) return false;
138 int err;
139 if((err = clEnqueueWriteBuffer(cmdq, mbuf->mem, 1, 0, sz, src, 0, 0, 0)) != 0) {
140 fprintf(stderr, "failed to write buffer: %s\n", clstrerror(err));
141 return false;
142 }
143 return true;
144 }
146 bool read_mem_buffer(CLMemBuffer *mbuf, size_t sz, void *dest)
147 {
148 if(!mbuf) return false;
150 int err;
151 if((err = clEnqueueReadBuffer(cmdq, mbuf->mem, 1, 0, sz, dest, 0, 0, 0)) != 0) {
152 fprintf(stderr, "failed to read buffer: %s\n", clstrerror(err));
153 return false;
154 }
155 return true;
156 }
159 CLArg::CLArg()
160 {
161 memset(this, 0, sizeof *this);
162 }
165 CLProgram::CLProgram(const char *kname)
166 {
167 prog = 0;
168 kernel = 0;
169 this->kname = kname;
170 args.resize(16);
171 built = false;
172 }
174 CLProgram::~CLProgram()
175 {
176 if(prog) {
178 clReleaseProgram(prog);
179 }
180 if(kernel) {
182 clReleaseKernel(kernel);
183 }
184 for(size_t i=0; i<args.size(); i++) {
185 if(args[i].type == ARGTYPE_MEM_BUF) {
186 destroy_mem_buffer(args[i].v.mbuf);
187 }
188 }
189 }
191 bool CLProgram::load(const char *fname)
192 {
193 FILE *fp;
194 char *src;
195 struct stat st;
197 printf("loading opencl program (%s)\n", fname);
199 if(!(fp = fopen(fname, "rb"))) {
200 fprintf(stderr, "failed to open %s: %s\n", fname, strerror(errno));
201 return false;
202 }
204 fstat(fileno(fp), &st);
206 src = new char[st.st_size + 1];
208 fread(src, 1, st.st_size, fp);
209 src[st.st_size] = 0;
210 fclose(fp);
213 if(!(prog = clCreateProgramWithSource(ctx, 1, (const char**)&src, 0, 0))) {
214 fprintf(stderr, "error creating program object: %s\n", fname);
215 delete [] src;
216 return false;
217 }
218 delete [] src;
219 return true;
220 }
222 bool CLProgram::set_argi(int idx, int val)
223 {
224 if((int)args.size() <= idx) {
225 args.resize(idx + 1);
226 }
228 CLArg *arg = &args[idx];
229 arg->type = ARGTYPE_INT;
230 arg->v.ival = val;
231 return true;
232 }
234 bool CLProgram::set_argf(int idx, float val)
235 {
236 if((int)args.size() <= idx) {
237 args.resize(idx + 1);
238 }
240 CLArg *arg = &args[idx];
241 arg->type = ARGTYPE_FLOAT;
242 arg->v.fval = val;
243 return true;
244 }
246 bool CLProgram::set_arg_buffer(int idx, int rdwr, size_t sz, const void *ptr)
247 {
248 printf("create argument %d buffer: %d bytes\n", idx, (int)sz);
249 CLMemBuffer *buf;
251 if(sz <= 0 || !(buf = create_mem_buffer(rdwr, sz, ptr))) {
252 fprintf(stderr, "invalid size while creating argument buffer %d: %d\n", idx, (int)sz);
253 return false;
254 }
256 if((int)args.size() <= idx) {
257 args.resize(idx + 1);
258 }
259 args[idx].type = ARGTYPE_MEM_BUF;
260 args[idx].v.mbuf = buf;
261 return true;
262 }
264 CLMemBuffer *CLProgram::get_arg_buffer(int arg)
265 {
266 if(arg < 0 || arg >= (int)args.size() || args[arg].type != ARGTYPE_MEM_BUF) {
267 return 0;
268 }
269 return args[arg].v.mbuf;
270 }
272 int CLProgram::get_num_args() const
273 {
274 int num_args = 0;
275 for(size_t i=0; i<args.size(); i++) {
276 if(args[i].type != ARGTYPE_NONE) {
277 num_args++;
278 }
279 }
280 return num_args;
281 }
283 bool CLProgram::build()
284 {
285 int err;
287 if((err = clBuildProgram(prog, 0, 0, 0, 0, 0)) != 0) {
288 size_t sz;
289 clGetProgramBuildInfo(prog, devinf.id, CL_PROGRAM_BUILD_LOG, 0, 0, &sz);
291 char *errlog = (char*)alloca(sz + 1);
292 clGetProgramBuildInfo(prog, devinf.id, CL_PROGRAM_BUILD_LOG, sz, errlog, 0);
293 fprintf(stderr, "failed to build program: %s\n%s\n", clstrerror(err), errlog);
295 clReleaseProgram(prog);
296 prog = 0;
297 return false;
298 }
301 if(!(kernel = clCreateKernel(prog, kname.c_str(), 0))) {
302 fprintf(stderr, "failed to create kernel: %s\n", kname.c_str());
303 clReleaseProgram(prog);
304 prog = 0;
305 return false;
306 }
308 for(size_t i=0; i<args.size(); i++) {
309 int err;
311 if(args[i].type == ARGTYPE_NONE) {
312 break;
313 }
315 switch(args[i].type) {
316 case ARGTYPE_INT:
317 if((err = clSetKernelArg(kernel, i, sizeof(int), &args[i].v.ival)) != 0) {
318 fprintf(stderr, "failed to bind kernel argument %d: %s\n", (int)i, clstrerror(err));
319 goto fail;
320 }
321 break;
323 case ARGTYPE_FLOAT:
324 if((err = clSetKernelArg(kernel, i, sizeof(float), &args[i].v.fval)) != 0) {
325 fprintf(stderr, "failed to bind kernel argument %d: %s\n", (int)i, clstrerror(err));
326 goto fail;
327 }
328 break;
330 case ARGTYPE_MEM_BUF:
331 {
332 CLMemBuffer *mbuf = args[i].v.mbuf;
334 if((err = clSetKernelArg(kernel, i, sizeof mbuf->mem, &mbuf->mem)) != 0) {
335 fprintf(stderr, "failed to bind kernel argument %d: %s\n", (int)i, clstrerror(err));
336 goto fail;
337 }
338 }
339 break;
341 default:
342 break;
343 }
344 }
346 built = true;
347 return true;
349 fail:
350 clReleaseProgram(prog);
351 clReleaseKernel(kernel);
352 prog = 0;
353 kernel = 0;
354 return false;
355 }
357 bool CLProgram::run() const
358 {
359 return run(1, 1);
360 }
362 bool CLProgram::run(int dim, ...) const
363 {
364 if(!built) {
365 if(!((CLProgram*)this)->build()) {
366 return false;
367 }
368 }
370 va_list ap;
371 size_t *global_size = (size_t*)alloca(dim * sizeof *global_size);
373 va_start(ap, dim);
374 for(int i=0; i<dim; i++) {
375 global_size[i] = va_arg(ap, int);
376 }
377 va_end(ap);
379 int err;
380 cl_event event;
382 if((err = clEnqueueNDRangeKernel(cmdq, kernel, dim, 0, global_size, 0, 0, 0, &event)) != 0) {
383 fprintf(stderr, "error executing kernel: %s\n", clstrerror(err));
384 return false;
385 }
387 clWaitForEvents(1, &event);
388 clReleaseEvent(event);
389 return true;
390 }
392 static int select_device(struct device_info *dev_inf, int (*devcmp)(struct device_info*, struct device_info*))
393 {
394 unsigned int i, j, num_dev, num_plat, sel, ret;
395 cl_device_id dev[32];
396 cl_platform_id plat[32];
398 dev_inf->work_item_sizes = 0;
400 if((ret = clGetPlatformIDs(32, plat, &num_plat)) != 0) {
401 fprintf(stderr, "clGetPlatformIDs failed: %s\n", clstrerror(ret));
402 return -1;
403 }
404 if(!num_plat) {
405 fprintf(stderr, "OpenCL not available!\n");
406 return -1;
407 }
409 for(i=0; i<num_plat; i++) {
410 char buf[512];
412 clGetPlatformInfo(plat[i], CL_PLATFORM_NAME, sizeof buf, buf, 0);
413 printf("[%d]: %s", i, buf);
414 clGetPlatformInfo(plat[i], CL_PLATFORM_VENDOR, sizeof buf, buf, 0);
415 printf(", %s", buf);
416 clGetPlatformInfo(plat[i], CL_PLATFORM_VERSION, sizeof buf, buf, 0);
417 printf(" (%s)\n", buf);
418 }
420 if((ret = clGetDeviceIDs(plat[0], CL_DEVICE_TYPE_ALL, 32, dev, &num_dev)) != 0) {
421 fprintf(stderr, "clGetDeviceIDs failed: %s\n", clstrerror(ret));
422 return -1;
423 }
424 printf("found %d cl devices.\n", num_dev);
426 for(i=0; i<num_dev; i++) {
427 struct device_info di;
429 if(get_dev_info(dev[i], &di) == -1) {
430 free(dev_inf->work_item_sizes);
431 return -1;
432 }
434 printf("--> device %u (%s)\n", i, devtypestr(di.type));
435 printf("max compute units: %u\n", di.units);
436 printf("max clock frequency: %u\n", di.clock);
437 printf("max work item dimensions: %u\n", di.dim);
439 printf("max work item sizes: ");
440 for(j=0; j<di.dim; j++) {
441 printf("%u", (unsigned int)di.work_item_sizes[j]);
442 if(di.dim - j > 1) {
443 printf(", ");
444 }
445 }
446 putchar('\n');
448 printf("max work group size: %u\n", (unsigned int)di.work_group_size);
449 printf("max object allocation size: ");
450 print_memsize(stdout, di.mem_size);
451 putchar('\n');
453 if(devcmp(&di, dev_inf) > 0) {
454 free(dev_inf->work_item_sizes);
455 memcpy(dev_inf, &di, sizeof di);
456 sel = i;
457 }
458 }
460 if(num_dev) {
461 printf("\nusing device: %d\n", sel);
462 return 0;
463 }
465 return -1;
466 }
468 static int get_dev_info(cl_device_id dev, struct device_info *di)
469 {
470 di->id = dev;
473 clGetDeviceInfo(dev, CL_DEVICE_TYPE, sizeof di->type, &di->type, 0);
474 clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof di->units, &di->units, 0);
475 clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof di->clock, &di->clock, 0);
476 clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof di->dim, &di->dim, 0);
478 di->work_item_sizes = new size_t[di->dim];
480 clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, di->dim * sizeof *di->work_item_sizes, di->work_item_sizes, 0);
481 clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof di->work_group_size, &di->work_group_size, 0);
482 clGetDeviceInfo(dev, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof di->mem_size, &di->mem_size, 0);
484 return 0;
485 }
487 static int devcmp(struct device_info *a, struct device_info *b)
488 {
489 unsigned int aval = a->units * a->clock;
490 unsigned int bval = b->units * b->clock;
492 return aval - bval;
493 }
495 static const char *devtypestr(cl_device_type type)
496 {
497 switch(type) {
498 case CL_DEVICE_TYPE_CPU:
499 return "cpu";
500 case CL_DEVICE_TYPE_GPU:
501 return "gpu";
502 case CL_DEVICE_TYPE_ACCELERATOR:
503 return "accelerator";
504 default:
505 break;
506 }
507 return "unknown";
508 }
/* Prints a memory size to out, scaled down (rounding towards zero) to the
 * largest unit in which it is at least 1: bytes, kb, mb, gb, tb or pb.
 * For anything other than bytes, the exact byte count follows in parentheses.
 *
 * Sizes of 1024 pb or more are printed in pb. They used to run off the end
 * of the unit table and print nothing at all.
 */
static void print_memsize(FILE *out, unsigned long bytes)
{
	static const char *suffix[] = {"bytes", "kb", "mb", "gb", "tb", "pb"};
	const int last = (int)(sizeof suffix / sizeof *suffix) - 1;
	unsigned long memsz = bytes;
	int i = 0;

	while(memsz >= 1024 && i < last) {
		memsz /= 1024;
		i++;
	}

	fprintf(out, "%lu %s", memsz, suffix[i]);
	if(i > 0) {
		fprintf(out, " (%lu bytes)", bytes);
	}
}
529 static const char *clstrerror(int err)
530 {
531 if(err > 0) {
532 return "<invalid error code>";
533 }
534 if(err <= -(int)(sizeof ocl_errstr / sizeof *ocl_errstr)) {
535 return "<unknown error>";
536 }
537 return ocl_errstr[-err];
538 }