clray

diff src/ocl.cc @ 0:5767277e049f

first test works, let's try to make a raytracer now...
author John Tsiombikas <nuclear@member.fsf.org>
date Mon, 12 Jul 2010 05:56:47 +0300
parents
children 0b0e4d18d53f
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/ocl.cc	Mon Jul 12 05:56:47 2010 +0300
     1.3 @@ -0,0 +1,399 @@
     1.4 +#include <stdio.h>
     1.5 +#include <stdlib.h>
     1.6 +#include <string.h>
     1.7 +#include <errno.h>
     1.8 +#include <alloca.h>
     1.9 +#include <sys/stat.h>
    1.10 +#include "ocl.h"
    1.11 +
    1.12 +
    1.13 +class InitCL {
    1.14 +public:
    1.15 +	InitCL();
    1.16 +};
    1.17 +
    1.18 +struct device_info {
    1.19 +	cl_device_id id;
    1.20 +	cl_device_type type;
    1.21 +	unsigned int units;
    1.22 +	unsigned int clock;
    1.23 +
    1.24 +	unsigned int dim;
    1.25 +	size_t *work_item_sizes;
    1.26 +	size_t work_group_size;
    1.27 +
    1.28 +	unsigned long mem_size;
    1.29 +};
    1.30 +
    1.31 +static bool init_opencl(void);
    1.32 +static int select_device(struct device_info *di, int (*devcmp)(struct device_info*, struct device_info*));
    1.33 +static int get_dev_info(cl_device_id dev, struct device_info *di);
    1.34 +static int devcmp(struct device_info *a, struct device_info *b);
    1.35 +static const char *devtypestr(cl_device_type type);
    1.36 +static void print_memsize(FILE *out, unsigned long memsz);
    1.37 +
    1.38 +
    1.39 +static InitCL initcl;
    1.40 +static cl_context ctx;
    1.41 +static cl_command_queue cmdq;
    1.42 +static device_info devinf;
    1.43 +
    1.44 +InitCL::InitCL()
    1.45 +{
    1.46 +	if(!init_opencl()) {
    1.47 +		exit(0);
    1.48 +	}
    1.49 +}
    1.50 +
    1.51 +static bool init_opencl(void)
    1.52 +{
    1.53 +	if(select_device(&devinf, devcmp) == -1) {
    1.54 +		return false;
    1.55 +	}
    1.56 +
    1.57 +
    1.58 +	if(!(ctx = clCreateContext(0, 1, &devinf.id, 0, 0, 0))) {
    1.59 +		fprintf(stderr, "failed to create opencl context\n");
    1.60 +		return false;
    1.61 +	}
    1.62 +
    1.63 +	if(!(cmdq = clCreateCommandQueue(ctx, devinf.id, 0, 0))) {
    1.64 +		fprintf(stderr, "failed to create command queue\n");
    1.65 +		return false;
    1.66 +	}
    1.67 +	return true;
    1.68 +}
    1.69 +
    1.70 +
    1.71 +CLMemBuffer *create_mem_buffer(int rdwr, size_t sz, void *buf)
    1.72 +{
    1.73 +	int err;
    1.74 +	cl_mem mem;
    1.75 +
    1.76 +
    1.77 +	if(!(mem = clCreateBuffer(ctx, rdwr | CL_MEM_USE_HOST_PTR, sz, buf, &err))) {
    1.78 +		fprintf(stderr, "failed to create memory buffer (%d)\n", err);
    1.79 +		return 0;
    1.80 +	}
    1.81 +
    1.82 +	CLMemBuffer *mbuf = new CLMemBuffer;
    1.83 +	mbuf->mem = mem;
    1.84 +	mbuf->size = sz;
    1.85 +	return mbuf;
    1.86 +}
    1.87 +
    1.88 +void destroy_mem_buffer(CLMemBuffer *mbuf)
    1.89 +{
    1.90 +	if(mbuf) {
    1.91 +
    1.92 +		clReleaseMemObject(mbuf->mem);
    1.93 +		delete mbuf;
    1.94 +	}
    1.95 +}
    1.96 +
    1.97 +void *map_mem_buffer(CLMemBuffer *mbuf, int rdwr)
    1.98 +{
    1.99 +	if(!mbuf) return 0;
   1.100 +
   1.101 +	int err;
   1.102 +	mbuf->ptr = clEnqueueMapBuffer(cmdq, mbuf->mem, 1, rdwr, 0, mbuf->size, 0, 0, 0, &err);
   1.103 +	if(!mbuf->ptr) {
   1.104 +		fprintf(stderr, "failed to map buffer (%d)\n", err);
   1.105 +		return 0;
   1.106 +	}
   1.107 +	return mbuf->ptr;
   1.108 +}
   1.109 +
   1.110 +void unmap_mem_buffer(CLMemBuffer *mbuf)
   1.111 +{
   1.112 +	if(!mbuf || !mbuf->ptr) return;
   1.113 +	clEnqueueUnmapMemObject(cmdq, mbuf->mem, mbuf->ptr, 0, 0, 0);
   1.114 +}
   1.115 +
   1.116 +bool write_mem_buffer(CLMemBuffer *mbuf, size_t sz, void *src)
   1.117 +{
   1.118 +	if(!mbuf) return false;
   1.119 +
   1.120 +	int err;
   1.121 +	if((err = clEnqueueWriteBuffer(cmdq, mbuf->mem, 1, 0, sz, src, 0, 0, 0)) != 0) {
   1.122 +		fprintf(stderr, "failed to write buffer (%d)\n", err);
   1.123 +		return false;
   1.124 +	}
   1.125 +	return true;
   1.126 +}
   1.127 +
   1.128 +bool read_mem_buffer(CLMemBuffer *mbuf, size_t sz, void *dest)
   1.129 +{
   1.130 +	if(!mbuf) return false;
   1.131 +
   1.132 +	int err;
   1.133 +	if((err = clEnqueueReadBuffer(cmdq, mbuf->mem, 1, 0, sz, dest, 0, 0, 0)) != 0) {
   1.134 +		fprintf(stderr, "failed to read buffer (%d)\n", err);
   1.135 +		return false;
   1.136 +	}
   1.137 +	return true;
   1.138 +}
   1.139 +
   1.140 +
   1.141 +CLProgram::CLProgram(const char *kname)
   1.142 +{
   1.143 +	prog = 0;
   1.144 +	kernel = 0;
   1.145 +	this->kname = kname;
   1.146 +	mbuf.resize(16);
   1.147 +	built = false;
   1.148 +}
   1.149 +
   1.150 +CLProgram::~CLProgram()
   1.151 +{
   1.152 +	if(prog) {
   1.153 +
   1.154 +		clReleaseProgram(prog);
   1.155 +	}
   1.156 +	if(kernel) {
   1.157 +
   1.158 +		clReleaseKernel(kernel);
   1.159 +	}
   1.160 +	for(size_t i=0; i<mbuf.size(); i++) {
   1.161 +		if(mbuf[i]) {
   1.162 +			destroy_mem_buffer(mbuf[i]);
   1.163 +		}
   1.164 +	}
   1.165 +}
   1.166 +
   1.167 +bool CLProgram::load(const char *fname)
   1.168 +{
   1.169 +	FILE *fp;
   1.170 +	char *src;
   1.171 +	struct stat st;
   1.172 +
   1.173 +	printf("loading opencl program (%s)\n", fname);
   1.174 +
   1.175 +	if(!(fp = fopen(fname, "rb"))) {
   1.176 +		fprintf(stderr, "failed to open %s: %s\n", fname, strerror(errno));
   1.177 +		return false;
   1.178 +	}
   1.179 +
   1.180 +	fstat(fileno(fp), &st);
   1.181 +
   1.182 +	src = new char[st.st_size + 1];
   1.183 +
   1.184 +	fread(src, 1, st.st_size, fp);
   1.185 +	src[st.st_size] = 0;
   1.186 +	fclose(fp);
   1.187 +
   1.188 +
   1.189 +	if(!(prog = clCreateProgramWithSource(ctx, 1, (const char**)&src, 0, 0))) {
   1.190 +		fprintf(stderr, "error creating program object: %s\n", fname);
   1.191 +		delete [] src;
   1.192 +		return false;
   1.193 +	}
   1.194 +	delete [] src;
   1.195 +	return true;
   1.196 +}
   1.197 +
   1.198 +bool CLProgram::set_arg(int arg, int rdwr, size_t sz, void *ptr)
   1.199 +{
   1.200 +	CLMemBuffer *buf;
   1.201 +
   1.202 +	if(!(buf = create_mem_buffer(rdwr, sz, ptr))) {
   1.203 +		return false;
   1.204 +	}
   1.205 +
   1.206 +	if((int)mbuf.size() <= arg) {
   1.207 +		mbuf.resize(arg + 1);
   1.208 +	}
   1.209 +	mbuf[arg] = buf;
   1.210 +	return true;
   1.211 +}
   1.212 +
   1.213 +CLMemBuffer *CLProgram::get_arg_buffer(int arg)
   1.214 +{
   1.215 +	if(arg < 0 || arg >= (int)mbuf.size()) {
   1.216 +		return 0;
   1.217 +	}
   1.218 +	return mbuf[arg];
   1.219 +}
   1.220 +
   1.221 +bool CLProgram::build()
   1.222 +{
   1.223 +	char errlog[512];
   1.224 +
   1.225 +
   1.226 +	if(clBuildProgram(prog, 0, 0, 0, 0, 0) != 0) {
   1.227 +		clGetProgramBuildInfo(prog, devinf.id, CL_PROGRAM_BUILD_LOG, sizeof errlog, errlog, 0);
   1.228 +		fprintf(stderr, "failed to build program:\n%s\n", errlog);
   1.229 +		clReleaseProgram(prog);
   1.230 +		prog = 0;
   1.231 +		return false;
   1.232 +	}
   1.233 +
   1.234 +
   1.235 +	if(!(kernel = clCreateKernel(prog, kname.c_str(), 0))) {
   1.236 +		fprintf(stderr, "failed to create kernel: %s\n", kname.c_str());
   1.237 +		clReleaseProgram(prog);
   1.238 +		prog = 0;
   1.239 +		return false;
   1.240 +	}
   1.241 +
   1.242 +	for(size_t i=0; i<mbuf.size(); i++) {
   1.243 +		if(!mbuf[i]) break;
   1.244 +
   1.245 +		int err;
   1.246 +		if((err = clSetKernelArg(kernel, i, sizeof mbuf[i]->mem, &mbuf[i]->mem)) != 0) {
   1.247 +			fprintf(stderr, "failed to bind kernel argument: %d (%d)\n", (int)i, err);
   1.248 +			clReleaseProgram(prog);
   1.249 +			clReleaseKernel(kernel);
   1.250 +			prog = 0;
   1.251 +			kernel = 0;
   1.252 +			return false;
   1.253 +		}
   1.254 +	}
   1.255 +
   1.256 +	built = true;
   1.257 +	return true;
   1.258 +}
   1.259 +
   1.260 +bool CLProgram::run() const
   1.261 +{
   1.262 +	return run(1, 1);
   1.263 +}
   1.264 +
   1.265 +bool CLProgram::run(int dim, ...) const
   1.266 +{
   1.267 +	if(!built) {
   1.268 +		if(!((CLProgram*)this)->build()) {
   1.269 +			return false;
   1.270 +		}
   1.271 +	}
   1.272 +
   1.273 +	va_list ap;
   1.274 +	size_t *global_size = (size_t*)alloca(dim * sizeof *global_size);
   1.275 +
   1.276 +	va_start(ap, dim);
   1.277 +	for(int i=0; i<dim; i++) {
   1.278 +		global_size[i] = va_arg(ap, int);
   1.279 +	}
   1.280 +	va_end(ap);
   1.281 +
   1.282 +	int err;
   1.283 +	if((err = clEnqueueNDRangeKernel(cmdq, kernel, dim, 0, global_size, 0, 0, 0, 0)) != 0) {
   1.284 +		fprintf(stderr, "error executing kernel (%d)\n", err);
   1.285 +		return false;
   1.286 +	}
   1.287 +	return true;
   1.288 +}
   1.289 +
   1.290 +static int select_device(struct device_info *dev_inf, int (*devcmp)(struct device_info*, struct device_info*))
   1.291 +{
   1.292 +	unsigned int i, j, num_dev, sel;
   1.293 +	cl_device_id dev[32];
   1.294 +
   1.295 +	dev_inf->work_item_sizes = 0;
   1.296 +
   1.297 +
   1.298 +	clGetDeviceIDs(0, CL_DEVICE_TYPE_ALL, 32, dev, &num_dev);
   1.299 +	printf("found %d cl devices.\n", num_dev);
   1.300 +
   1.301 +	for(i=0; i<num_dev; i++) {
   1.302 +		struct device_info di;
   1.303 +
   1.304 +		if(get_dev_info(dev[i], &di) == -1) {
   1.305 +			free(dev_inf->work_item_sizes);
   1.306 +			return -1;
   1.307 +		}
   1.308 +
   1.309 +		printf("--> device %u (%s)\n", i, devtypestr(di.type));
   1.310 +		printf("max compute units: %u\n", di.units);
   1.311 +		printf("max clock frequency: %u\n", di.clock);
   1.312 +		printf("max work item dimensions: %u\n", di.dim);
   1.313 +
   1.314 +		printf("max work item sizes: ");
   1.315 +		for(j=0; j<di.dim; j++) {
   1.316 +			printf("%u", (unsigned int)di.work_item_sizes[j]);
   1.317 +			if(di.dim - j > 1) {
   1.318 +				printf(", ");
   1.319 +			}
   1.320 +		}
   1.321 +		putchar('\n');
   1.322 +
   1.323 +		printf("max work group size: %u\n", (unsigned int)di.work_group_size);
   1.324 +		printf("max object allocation size: ");
   1.325 +		print_memsize(stdout, di.mem_size);
   1.326 +		putchar('\n');
   1.327 +
   1.328 +		if(devcmp(&di, dev_inf) > 0) {
   1.329 +			free(dev_inf->work_item_sizes);
   1.330 +			memcpy(dev_inf, &di, sizeof di);
   1.331 +			sel = i;
   1.332 +		}
   1.333 +	}
   1.334 +
   1.335 +	if(num_dev) {
   1.336 +		printf("\nusing device: %d\n", sel);
   1.337 +		return 0;
   1.338 +	}
   1.339 +
   1.340 +	return -1;
   1.341 +}
   1.342 +
   1.343 +static int get_dev_info(cl_device_id dev, struct device_info *di)
   1.344 +{
   1.345 +	di->id = dev;
   1.346 +
   1.347 +
   1.348 +	clGetDeviceInfo(dev, CL_DEVICE_TYPE, sizeof di->type, &di->type, 0);
   1.349 +	clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof di->units, &di->units, 0);
   1.350 +	clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof di->clock, &di->clock, 0);
   1.351 +	clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof di->dim, &di->dim, 0);
   1.352 +
   1.353 +	di->work_item_sizes = new size_t[di->dim];
   1.354 +
   1.355 +	clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, di->dim * sizeof *di->work_item_sizes, di->work_item_sizes, 0);
   1.356 +	clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof di->work_group_size, &di->work_group_size, 0);
   1.357 +	clGetDeviceInfo(dev, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof di->mem_size, &di->mem_size, 0);
   1.358 +
   1.359 +	return 0;
   1.360 +}
   1.361 +
   1.362 +static int devcmp(struct device_info *a, struct device_info *b)
   1.363 +{
   1.364 +	unsigned int aval = a->units * a->clock;
   1.365 +	unsigned int bval = b->units * b->clock;
   1.366 +
   1.367 +	return aval - bval;
   1.368 +}
   1.369 +
   1.370 +static const char *devtypestr(cl_device_type type)
   1.371 +{
   1.372 +	switch(type) {
   1.373 +	case CL_DEVICE_TYPE_CPU:
   1.374 +		return "cpu";
   1.375 +	case CL_DEVICE_TYPE_GPU:
   1.376 +		return "gpu";
   1.377 +	case CL_DEVICE_TYPE_ACCELERATOR:
   1.378 +		return "accelerator";
   1.379 +	default:
   1.380 +		break;
   1.381 +	}
   1.382 +	return "unknown";
   1.383 +}
   1.384 +
   1.385 +static void print_memsize(FILE *out, unsigned long bytes)
   1.386 +{
   1.387 +	int i;
   1.388 +	unsigned long memsz = bytes;
   1.389 +	const char *suffix[] = {"bytes", "kb", "mb", "gb", "tb", "pb", 0};
   1.390 +
   1.391 +	for(i=0; suffix[i]; i++) {
   1.392 +		if(memsz < 1024) {
   1.393 +			fprintf(out, "%lu %s", memsz, suffix[i]);
   1.394 +			if(i > 0) {
   1.395 +				fprintf(out, " (%lu bytes)", bytes);
   1.396 +			}
   1.397 +			return;
   1.398 +		}
   1.399 +
   1.400 +		memsz /= 1024;
   1.401 +	}
   1.402 +}