clray

changeset 43:f9eec11e5acc

shoehorned the kdtree into an opnecl image and improved performance slightly
author John Tsiombikas <nuclear@member.fsf.org>
date Sat, 28 Aug 2010 09:38:49 +0100
parents 1169f3d04135
children e7f79c6ad246
files rt.cl src/clray.cc src/ocl.cc src/rt.cc src/scene.h
diffstat 5 files changed, 151 insertions(+), 44 deletions(-) [+]
line diff
     1.1 --- a/rt.cl	Sat Aug 28 02:01:16 2010 +0100
     1.2 +++ b/rt.cl	Sat Aug 28 09:38:49 2010 +0100
     1.3 @@ -51,26 +51,28 @@
     1.4  	global const struct Light *lights;
     1.5  	int num_lights;
     1.6  	global const struct Material *matlib;
     1.7 -	global const struct KDNode *kdtree;
     1.8 +	//global const struct KDNode *kdtree;
     1.9  };
    1.10  
    1.11  struct AABBox {
    1.12  	float4 min, max;
    1.13  };
    1.14  
    1.15 +#define MAX_NODE_FACES	32
    1.16  struct KDNode {
    1.17  	struct AABBox aabb;
    1.18 -	int face_idx[32];
    1.19 +	int face_idx[MAX_NODE_FACES];
    1.20  	int num_faces;
    1.21  	int left, right;
    1.22  	int padding;
    1.23  };
    1.24  
    1.25 +#define RAY_MAG		500.0
    1.26  #define MIN_ENERGY	0.001
    1.27  #define EPSILON		1e-5
    1.28  
    1.29 -float4 shade(struct Ray ray, struct Scene *scn, const struct SurfPoint *sp);
    1.30 -bool find_intersection(struct Ray ray, const struct Scene *scn, struct SurfPoint *sp);
    1.31 +float4 shade(struct Ray ray, struct Scene *scn, const struct SurfPoint *sp, read_only image2d_t kdimg);
    1.32 +bool find_intersection(struct Ray ray, const struct Scene *scn, struct SurfPoint *sp, read_only image2d_t kdimg);
    1.33  bool intersect(struct Ray ray, global const struct Face *face, struct SurfPoint *sp);
    1.34  bool intersect_aabb(struct Ray ray, struct AABBox aabb);
    1.35  
    1.36 @@ -80,6 +82,8 @@
    1.37  float4 calc_bary(float4 pt, global const struct Face *face, float4 norm);
    1.38  float mean(float4 v);
    1.39  
    1.40 +void read_kdnode(int idx, struct KDNode *node, read_only image2d_t kdimg);
    1.41 +
    1.42  
    1.43  kernel void render(write_only image2d_t fb,
    1.44  		global const struct RendInfo *rinf,
    1.45 @@ -89,7 +93,8 @@
    1.46  		global const struct Ray *primrays,
    1.47  		global const float *xform,
    1.48  		global const float *invtrans,
    1.49 -		global const struct KDNode *kdtree)
    1.50 +		//global const struct KDNode *kdtree
    1.51 +		read_only image2d_t kdtree_img)
    1.52  {
    1.53  	int idx = get_global_id(0);
    1.54  
    1.55 @@ -100,7 +105,7 @@
    1.56  	scn.lights = lights;
    1.57  	scn.num_lights = rinf->num_lights;
    1.58  	scn.matlib = matlib;
    1.59 -	scn.kdtree = kdtree;
    1.60 +	//scn.kdtree_img = kdtree_img;
    1.61  
    1.62  	struct Ray ray = primrays[idx];
    1.63  	transform_ray(&ray, xform, invtrans);
    1.64 @@ -111,8 +116,8 @@
    1.65  
    1.66  	while(iter++ < rinf->max_iter && mean(energy) > MIN_ENERGY) {
    1.67  		struct SurfPoint sp;
    1.68 -		if(find_intersection(ray, &scn, &sp)) {
    1.69 -			pixel += shade(ray, &scn, &sp) * energy;
    1.70 +		if(find_intersection(ray, &scn, &sp, kdtree_img)) {
    1.71 +			pixel += shade(ray, &scn, &sp, kdtree_img) * energy;
    1.72  
    1.73  			float4 refl_col = sp.mat.ks * sp.mat.kr;
    1.74  
    1.75 @@ -121,27 +126,25 @@
    1.76  
    1.77  			energy *= refl_col;
    1.78  		} else {
    1.79 -			break;
    1.80 +			energy = (float4)(0.0, 0.0, 0.0, 0.0);
    1.81  		}
    1.82  	}
    1.83  
    1.84 -	int img_x = get_image_width(fb);
    1.85 -
    1.86  	int2 coord;
    1.87 -	coord.x = idx % img_x;
    1.88 -	coord.y = idx / img_x;
    1.89 +	coord.x = idx % rinf->xsz;
    1.90 +	coord.y = idx / rinf->xsz;
    1.91  
    1.92  	write_imagef(fb, coord, pixel);
    1.93  }
    1.94  
    1.95 -float4 shade(struct Ray ray, struct Scene *scn, const struct SurfPoint *sp)
    1.96 +float4 shade(struct Ray ray, struct Scene *scn, const struct SurfPoint *sp, read_only image2d_t kdimg)
    1.97  {
    1.98  	float4 norm = sp->norm;
    1.99 -	bool entering = true;
   1.100 +	//bool entering = true;
   1.101  
   1.102  	if(dot(ray.dir, norm) >= 0.0) {
   1.103  		norm = -norm;
   1.104 -		entering = false;
   1.105 +		//entering = false;
   1.106  	}
   1.107  
   1.108  	float4 dcol = scn->ambient * sp->mat.kd;
   1.109 @@ -154,16 +157,19 @@
   1.110  		shadowray.origin = sp->pos;
   1.111  		shadowray.dir = ldir;
   1.112  
   1.113 -		if(!find_intersection(shadowray, scn, 0)) {
   1.114 +		if(!find_intersection(shadowray, scn, 0, kdimg)) {
   1.115  			ldir = normalize(ldir);
   1.116 -			float4 vdir = -normalize(ray.dir);
   1.117 +			float4 vdir = -ray.dir;
   1.118 +			vdir.x = native_divide(vdir.x, RAY_MAG);
   1.119 +			vdir.y = native_divide(vdir.y, RAY_MAG);
   1.120 +			vdir.z = native_divide(vdir.z, RAY_MAG);
   1.121  			float4 vref = reflect(vdir, norm);
   1.122  
   1.123  			float diff = fmax(dot(ldir, norm), 0.0f);
   1.124 -			dcol += sp->mat.kd * scn->lights[i].color * diff;
   1.125 +			dcol += sp->mat.kd /* scn->lights[i].color*/ * diff;
   1.126  
   1.127 -			float spec = powr(fmax(dot(ldir, vref), 0.0f), sp->mat.spow);
   1.128 -			scol += sp->mat.ks * scn->lights[i].color * spec;
   1.129 +			float spec = native_powr(fmax(dot(ldir, vref), 0.0f), sp->mat.spow);
   1.130 +			scol += sp->mat.ks /* scn->lights[i].color*/ * spec;
   1.131  		}
   1.132  	}
   1.133  
   1.134 @@ -171,7 +177,7 @@
   1.135  }
   1.136  
   1.137  #define STACK_SIZE	64
   1.138 -bool find_intersection(struct Ray ray, const struct Scene *scn, struct SurfPoint *spres)
   1.139 +bool find_intersection(struct Ray ray, const struct Scene *scn, struct SurfPoint *spres, read_only image2d_t kdimg)
   1.140  {
   1.141  	struct SurfPoint sp0;
   1.142  	sp0.t = 1.0;
   1.143 @@ -184,14 +190,15 @@
   1.144  	while(top > 0) {
   1.145  		int idx = idxstack[--top];	// remove this index from the stack and process it
   1.146  
   1.147 -		global const struct KDNode *node = scn->kdtree + idx;
   1.148 +		struct KDNode node;
   1.149 +		read_kdnode(idx, &node, kdimg);
   1.150  
   1.151 -		if(intersect_aabb(ray, node->aabb)) {
   1.152 -			if(node->left == -1) {
   1.153 +		if(intersect_aabb(ray, node.aabb)) {
   1.154 +			if(node.left == -1) {
   1.155  				// leaf node... check each face in turn and update the nearest intersection as needed
   1.156 -				for(int i=0; i<node->num_faces; i++) {
   1.157 +				for(int i=0; i<node.num_faces; i++) {
   1.158  					struct SurfPoint spt;
   1.159 -					int fidx = node->face_idx[i];
   1.160 +					int fidx = node.face_idx[i];
   1.161  
   1.162  					if(intersect(ray, scn->faces + fidx, &spt) && spt.t < sp0.t) {
   1.163  						sp0 = spt;
   1.164 @@ -199,8 +206,8 @@
   1.165  				}
   1.166  			} else {
   1.167  				// internal node... recurse to the children
   1.168 -				idxstack[top++] = node->left;
   1.169 -				idxstack[top++] = node->right;
   1.170 +				idxstack[top++] = node.left;
   1.171 +				idxstack[top++] = node.right;
   1.172  			}
   1.173  		}
   1.174  	}
   1.175 @@ -232,7 +239,7 @@
   1.176  	float4 vec = pt - origin;
   1.177  
   1.178  	float ndotvec = dot(norm, vec);
   1.179 -	float t = ndotvec / ndotdir;
   1.180 +	float t = native_divide(ndotvec, ndotdir);
   1.181  
   1.182  	if(t < EPSILON || t > 1.0) {
   1.183  		return false;
   1.184 @@ -269,12 +276,12 @@
   1.185  	};
   1.186  
   1.187  	int xsign = (int)(ray.dir.x < 0.0);
   1.188 -	float invdirx = 1.0 / ray.dir.x;
   1.189 +	float invdirx = native_recip(ray.dir.x);
   1.190  	float tmin = (bbox[xsign].x - ray.origin.x) * invdirx;
   1.191  	float tmax = (bbox[1 - xsign].x - ray.origin.x) * invdirx;
   1.192  
   1.193  	int ysign = (int)(ray.dir.y < 0.0);
   1.194 -	float invdiry = 1.0 / ray.dir.y;
   1.195 +	float invdiry = native_recip(ray.dir.y);
   1.196  	float tymin = (bbox[ysign].y - ray.origin.y) * invdiry;
   1.197  	float tymax = (bbox[1 - ysign].y - ray.origin.y) * invdiry;
   1.198  
   1.199 @@ -286,7 +293,7 @@
   1.200  	if(tymax < tmax) tmax = tymax;
   1.201  
   1.202  	int zsign = (int)(ray.dir.z < 0.0);
   1.203 -	float invdirz = 1.0 / ray.dir.z;
   1.204 +	float invdirz = native_recip(ray.dir.z);
   1.205  	float tzmin = (bbox[zsign].z - ray.origin.z) * invdirz;
   1.206  	float tzmax = (bbox[1 - zsign].z - ray.origin.z) * invdirz;
   1.207  
   1.208 @@ -345,9 +352,9 @@
   1.209  	float a1 = fabs(dot(x20, norm)) * 0.5;
   1.210  	float a2 = fabs(dot(x01, norm)) * 0.5;
   1.211  
   1.212 -	bc.x = a0 / area;
   1.213 -	bc.y = a1 / area;
   1.214 -	bc.z = a2 / area;
   1.215 +	bc.x = native_divide(a0, area);
   1.216 +	bc.y = native_divide(a1, area);
   1.217 +	bc.z = native_divide(a2, area);
   1.218  	return bc;
   1.219  }
   1.220  
   1.221 @@ -355,3 +362,32 @@
   1.222  {
   1.223  	return native_divide(v.x + v.y + v.z, 3.0);
   1.224  }
   1.225 +
   1.226 +
   1.227 +const sampler_t kdsampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
   1.228 +
   1.229 +// read a KD-tree node from a texture scanline
   1.230 +void read_kdnode(int idx, struct KDNode *node, read_only image2d_t kdimg)
   1.231 +{
   1.232 +	int2 tc;
   1.233 +	tc.x = 0;
   1.234 +	tc.y = idx;
   1.235 +
   1.236 +	node->aabb.min = read_imagef(kdimg, kdsampler, tc); tc.x++;
   1.237 +	node->aabb.max = read_imagef(kdimg, kdsampler, tc);
   1.238 +
   1.239 +	tc.x = 2 + MAX_NODE_FACES / 4;
   1.240 +	float4 pix = read_imagef(kdimg, kdsampler, tc);
   1.241 +	node->num_faces = (int)pix.x;
   1.242 +	node->left = (int)pix.y;
   1.243 +	node->right = (int)pix.z;
   1.244 +
   1.245 +	tc.x = 2;
   1.246 +	for(int i=0; i<node->num_faces; i+=4) {
   1.247 +		float4 pix = read_imagef(kdimg, kdsampler, tc); tc.x++;
   1.248 +		node->face_idx[i] = (int)pix.x;
   1.249 +		node->face_idx[i + 1] = (int)pix.y;
   1.250 +		node->face_idx[i + 2] = (int)pix.z;
   1.251 +		node->face_idx[i + 3] = (int)pix.w;
   1.252 +	}
   1.253 +}
     2.1 --- a/src/clray.cc	Sat Aug 28 02:01:16 2010 +0100
     2.2 +++ b/src/clray.cc	Sat Aug 28 09:38:49 2010 +0100
     2.3 @@ -34,6 +34,7 @@
     2.4  static Scene scn;
     2.5  static unsigned int tex;
     2.6  
     2.7 +
     2.8  int main(int argc, char **argv)
     2.9  {
    2.10  	glutInitWindowSize(800, 600);
     3.1 --- a/src/ocl.cc	Sat Aug 28 02:01:16 2010 +0100
     3.2 +++ b/src/ocl.cc	Sat Aug 28 09:38:49 2010 +0100
     3.3 @@ -503,7 +503,9 @@
     3.4  {
     3.5  	int err;
     3.6  
     3.7 -	if((err = clBuildProgram(prog, 0, 0, "-cl-mad-enable", 0, 0)) != 0) {
     3.8 +	const char *opt = "-cl-mad-enable -cl-single-precision-constant -cl-fast-relaxed-math";
     3.9 +
    3.10 +	if((err = clBuildProgram(prog, 0, 0, opt, 0, 0)) != 0) {
    3.11  		size_t sz;
    3.12  		clGetProgramBuildInfo(prog, devinf.id, CL_PROGRAM_BUILD_LOG, 0, 0, &sz);
    3.13  
     4.1 --- a/src/rt.cc	Sat Aug 28 02:01:16 2010 +0100
     4.2 +++ b/src/rt.cc	Sat Aug 28 09:38:49 2010 +0100
     4.3 @@ -39,6 +39,7 @@
     4.4  };
     4.5  
     4.6  static Ray get_primary_ray(int x, int y, int w, int h, float vfov_deg);
     4.7 +static float *create_kdimage(const KDNodeGPU *kdtree, int num_nodes, int *xsz_ret, int *ysz_ret);
     4.8  
     4.9  static Face *faces;
    4.10  static Ray *prim_rays;
    4.11 @@ -52,6 +53,9 @@
    4.12  
    4.13  static RendInfo rinf;
    4.14  
    4.15 +static long timing_sample_sum;
    4.16 +static long num_timing_samples;
    4.17 +
    4.18  
    4.19  bool init_renderer(int xsz, int ysz, Scene *scn, unsigned int tex)
    4.20  {
    4.21 @@ -91,7 +95,9 @@
    4.22  		fprintf(stderr, "failed to create kdtree buffer\n");
    4.23  		return false;
    4.24  	}
    4.25 -	// XXX now we can actually destroy the original kdtree and keep only the GPU version
    4.26 +
    4.27 +	int kdimg_xsz, kdimg_ysz;
    4.28 +	float *kdimg_pixels = create_kdimage(kdbuf, scn->get_num_kdnodes(), &kdimg_xsz, &kdimg_ysz);
    4.29  
    4.30  	/* setup argument buffers */
    4.31  #ifdef CLGL_INTEROP
    4.32 @@ -106,7 +112,11 @@
    4.33  	prog->set_arg_buffer(KARG_PRIM_RAYS, ARG_RD, xsz * ysz * sizeof *prim_rays, prim_rays);
    4.34  	prog->set_arg_buffer(KARG_XFORM, ARG_RD, 16 * sizeof(float));
    4.35  	prog->set_arg_buffer(KARG_INVTRANS_XFORM, ARG_RD, 16 * sizeof(float));
    4.36 -	prog->set_arg_buffer(KARG_KDTREE, ARG_RD, scn->get_num_kdnodes() * sizeof *kdbuf, kdbuf);
    4.37 +	//prog->set_arg_buffer(KARG_KDTREE, ARG_RD, scn->get_num_kdnodes() * sizeof *kdbuf, kdbuf);
    4.38 +	prog->set_arg_image(KARG_KDTREE, ARG_RD, kdimg_xsz, kdimg_ysz, kdimg_pixels);
    4.39 +
    4.40 +	delete [] kdimg_pixels;
    4.41 +
    4.42  
    4.43  	if(prog->get_num_args() < NUM_KERNEL_ARGS) {
    4.44  		return false;
    4.45 @@ -125,6 +135,8 @@
    4.46  void destroy_renderer()
    4.47  {
    4.48  	delete prog;
    4.49 +
    4.50 +	printf("rendertime mean: %ld msec\n", timing_sample_sum / num_timing_samples);
    4.51  }
    4.52  
    4.53  bool render()
    4.54 @@ -172,7 +184,11 @@
    4.55  	unmap_mem_buffer(mbuf);
    4.56  #endif
    4.57  
    4.58 -	printf("rendered in %ld msec\n", get_msec() - tm0);
    4.59 +	long msec = get_msec() - tm0;
    4.60 +	timing_sample_sum += msec;
    4.61 +	num_timing_samples++;
    4.62 +
    4.63 +	printf("rendered in %ld msec\n", msec);
    4.64  	return true;
    4.65  }
    4.66  
    4.67 @@ -273,10 +289,61 @@
    4.68  	float py = 1.0 - ((float)y / (float)h) * ysz;
    4.69  	float pz = 1.0 / tan(0.5 * vfov);
    4.70  
    4.71 -	px *= 100.0;
    4.72 -	py *= 100.0;
    4.73 -	pz *= 100.0;
    4.74 +	float mag = sqrt(px * px + py * py + pz * pz);
    4.75 +
    4.76 +	px = px * 500.0 / mag;
    4.77 +	py = py * 500.0 / mag;
    4.78 +	pz = pz * 500.0 / mag;
    4.79  
    4.80  	Ray ray = {{0, 0, 0, 1}, {px, py, -pz, 1}};
    4.81  	return ray;
    4.82  }
    4.83 +
    4.84 +static int next_pow2(int x)
    4.85 +{
    4.86 +	x--;
    4.87 +	x = (x >> 1) | x;
    4.88 +	x = (x >> 2) | x;
    4.89 +	x = (x >> 4) | x;
    4.90 +	x = (x >> 8) | x;
    4.91 +	x = (x >> 16) | x;
    4.92 +	return x + 1;
    4.93 +}
    4.94 +
    4.95 +static float *create_kdimage(const KDNodeGPU *kdtree, int num_nodes, int *xsz_ret, int *ysz_ret)
    4.96 +{
    4.97 +	int xsz = 16;
    4.98 +	int ysz = next_pow2(num_nodes);
    4.99 +
   4.100 +	printf("creating kdtree image %dx%d (%d nodes)\n", xsz, ysz, num_nodes);
   4.101 +
   4.102 +	float *img = new float[4 * xsz * ysz];
   4.103 +	memset(img, 0, 4 * xsz * ysz * sizeof *img);
   4.104 +
   4.105 +	for(int i=0; i<num_nodes; i++) {
   4.106 +		float *ptr = img + i * 4 * xsz;
   4.107 +
   4.108 +		*ptr++ = kdtree[i].aabb.min[0];
   4.109 +		*ptr++ = kdtree[i].aabb.min[1];
   4.110 +		*ptr++ = kdtree[i].aabb.min[2];
   4.111 +		*ptr++ = 0.0;
   4.112 +
   4.113 +		*ptr++ = kdtree[i].aabb.max[0];
   4.114 +		*ptr++ = kdtree[i].aabb.max[1];
   4.115 +		*ptr++ = kdtree[i].aabb.max[2];
   4.116 +		*ptr++ = 0.0;
   4.117 +
   4.118 +		for(int j=0; j<MAX_NODE_FACES; j++) {
   4.119 +			*ptr++ = j < kdtree[i].num_faces ? (float)kdtree[i].face_idx[j] : 0.0f;
   4.120 +		}
   4.121 +
   4.122 +		*ptr++ = (float)kdtree[i].num_faces;
   4.123 +		*ptr++ = (float)kdtree[i].left;
   4.124 +		*ptr++ = (float)kdtree[i].right;
   4.125 +		*ptr++ = 0.0;
   4.126 +	}
   4.127 +
   4.128 +	if(xsz_ret) *xsz_ret = xsz;
   4.129 +	if(ysz_ret) *ysz_ret = ysz;
   4.130 +	return img;
   4.131 +}
     5.1 --- a/src/scene.h	Sat Aug 28 02:01:16 2010 +0100
     5.2 +++ b/src/scene.h	Sat Aug 28 09:38:49 2010 +0100
     5.3 @@ -51,9 +51,10 @@
     5.4  	KDNode();
     5.5  };
     5.6  
     5.7 +#define MAX_NODE_FACES	32
     5.8  struct KDNodeGPU {
     5.9  	AABBox aabb;
    5.10 -	int face_idx[32];
    5.11 +	int face_idx[MAX_NODE_FACES];
    5.12  	int num_faces;
    5.13  	int left, right;
    5.14  	int padding;