clray

changeset 32:4cf4919c3812

performance sucks
author John Tsiombikas <nuclear@member.fsf.org>
date Tue, 24 Aug 2010 05:43:57 +0100
parents 92786fc3317e
children 931d13b72f83
files src/clray.cc src/ocl.cc src/rt.cc src/scene.cc src/scene.h
diffstat 5 files changed, 65 insertions(+), 67 deletions(-) [+]
line diff
     1.1 --- a/src/clray.cc	Sun Aug 22 00:50:47 2010 +0100
     1.2 +++ b/src/clray.cc	Tue Aug 24 05:43:57 2010 +0100
     1.3 @@ -26,7 +26,7 @@
     1.4  static float cam_theta, cam_phi = 25.0;
     1.5  static float cam_dist = 10.0;
     1.6  
     1.7 -static bool dbg_glrender = true;
     1.8 +static bool dbg_glrender = false;
     1.9  static bool dbg_show_kdtree = false;
    1.10  static bool dbg_show_obj = true;
    1.11  
    1.12 @@ -107,6 +107,12 @@
    1.13  	}
    1.14  	atexit(cleanup);
    1.15  
    1.16 +	unsigned int *test_pattern = new unsigned int[xsz * ysz];
    1.17 +	for(int i=0; i<ysz; i++) {
    1.18 +		for(int j=0; j<xsz; j++) {
    1.19 +			test_pattern[i * xsz + j] = ((i >> 4) & 1) == ((j >> 4) & 1) ? 0xff0000 : 0xff00;
    1.20 +		}
    1.21 +	}
    1.22  
    1.23  	/*glGenTextures(1, &tex);
    1.24  	glBindTexture(GL_TEXTURE_2D, tex);*/
    1.25 @@ -114,7 +120,8 @@
    1.26  	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
    1.27  	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
    1.28  	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
    1.29 -	glTexImage2D(GL_TEXTURE_2D, 0, 4, xsz, ysz, 0, GL_RGBA, GL_FLOAT, 0);
    1.30 +	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB32F, xsz, ysz, 0, GL_RGBA, GL_UNSIGNED_BYTE, test_pattern);
    1.31 +	delete [] test_pattern;
    1.32  
    1.33  	glutMainLoop();
    1.34  	return 0;
     2.1 --- a/src/ocl.cc	Sun Aug 22 00:50:47 2010 +0100
     2.2 +++ b/src/ocl.cc	Tue Aug 24 05:43:57 2010 +0100
     2.3 @@ -377,10 +377,15 @@
     2.4  	va_end(ap);
     2.5  
     2.6  	int err;
     2.7 -	if((err = clEnqueueNDRangeKernel(cmdq, kernel, dim, 0, global_size, 0, 0, 0, 0)) != 0) {
     2.8 +	cl_event event;
     2.9 +
    2.10 +	if((err = clEnqueueNDRangeKernel(cmdq, kernel, dim, 0, global_size, 0, 0, 0, &event)) != 0) {
    2.11  		fprintf(stderr, "error executing kernel: %s\n", clstrerror(err));
    2.12  		return false;
    2.13  	}
    2.14 +
    2.15 +	clWaitForEvents(1, &event);
    2.16 +	clReleaseEvent(event);
    2.17  	return true;
    2.18  }
    2.19  
     3.1 --- a/src/rt.cc	Sun Aug 22 00:50:47 2010 +0100
     3.2 +++ b/src/rt.cc	Tue Aug 24 05:43:57 2010 +0100
     3.3 @@ -5,6 +5,7 @@
     3.4  #include "ogl.h"
     3.5  #include "ocl.h"
     3.6  #include "scene.h"
     3.7 +#include "timer.h"
     3.8  
     3.9  // kernel arguments
    3.10  enum {
    3.11 @@ -90,6 +91,7 @@
    3.12  		fprintf(stderr, "failed to create kdtree buffer\n");
    3.13  		return false;
    3.14  	}
    3.15 +	// XXX now we can actually destroy the original kdtree and keep only the GPU version
    3.16  
    3.17  	/* setup argument buffers */
    3.18  	prog->set_arg_buffer(KARG_FRAMEBUFFER, ARG_WR, xsz * ysz * 4 * sizeof(float));
    3.19 @@ -123,10 +125,14 @@
    3.20  
    3.21  bool render()
    3.22  {
    3.23 +	long tm0 = get_msec();
    3.24 +
    3.25  	if(!prog->run(1, global_size)) {
    3.26  		return false;
    3.27  	}
    3.28  
    3.29 +	long tm_run = get_msec() - tm0;
    3.30 +
    3.31  	CLMemBuffer *mbuf = prog->get_arg_buffer(KARG_FRAMEBUFFER);
    3.32  	void *fb = map_mem_buffer(mbuf, MAP_RD);
    3.33  	if(!fb) {
    3.34 @@ -134,14 +140,12 @@
    3.35  		return false;
    3.36  	}
    3.37  
    3.38 -	static int foo = 0;
    3.39 -	if(!foo++) {
    3.40 -		bool write_ppm(const char *fname, float *fb, int xsz, int ysz);
    3.41 -		write_ppm("foo.ppm", (float*)fb, rinf.xsz, rinf.ysz);
    3.42 -	}
    3.43 -
    3.44  	glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, rinf.xsz, rinf.ysz, GL_RGBA, GL_FLOAT, fb);
    3.45  	unmap_mem_buffer(mbuf);
    3.46 +
    3.47 +	long tm_upd = get_msec() - tm0 - tm_run;
    3.48 +
    3.49 +	printf("render %ld msec (%ld run, %ld upd)\n", tm_run + tm_upd, tm_run, tm_upd);
    3.50  	return true;
    3.51  }
    3.52  
     4.1 --- a/src/scene.cc	Sun Aug 22 00:50:47 2010 +0100
     4.2 +++ b/src/scene.cc	Tue Aug 24 05:43:57 2010 +0100
     4.3 @@ -6,10 +6,10 @@
     4.4  
     4.5  
     4.6  static void draw_kdtree(const KDNode *node, int level = 0);
     4.7 -static bool build_kdtree(KDNode *kd, int level = 0);
     4.8 -static float eval_cost(const std::list<const Face*> &faces, const AABBox &aabb, int axis, float par_sarea = 1.0);
     4.9 +static bool build_kdtree(KDNode *kd, const Face *faces, int level = 0);
    4.10 +static float eval_cost(const Face *faces, const int *face_idx, int num_faces, const AABBox &aabb, int axis);
    4.11  static void free_kdtree(KDNode *node);
    4.12 -static void kdtree_gpu_flatten(KDNodeGPU *kdbuf, int idx, const KDNode *node, const Face *facebuf);
    4.13 +static void kdtree_gpu_flatten(KDNodeGPU *kdbuf, int idx, const KDNode *node);
    4.14  static void print_item_counts(const KDNode *node, int level);
    4.15  
    4.16  
    4.17 @@ -58,10 +58,8 @@
    4.18  
    4.19  KDNode::KDNode()
    4.20  {
    4.21 -	axis = 0;
    4.22 -	pt = 0.0;
    4.23  	left = right = 0;
    4.24 -	num_faces = 0;
    4.25 +	cost = 0.0;
    4.26  }
    4.27  
    4.28  
    4.29 @@ -175,7 +173,7 @@
    4.30  	printf("allocating storage for the complete tree (%d)\n", max_nodes);
    4.31  
    4.32  	kdbuf = new KDNodeGPU[max_nodes + 1];
    4.33 -	kdtree_gpu_flatten(kdbuf, 1, kdtree, get_face_buffer());
    4.34 +	kdtree_gpu_flatten(kdbuf, 1, kdtree);
    4.35  	return kdbuf;
    4.36  }
    4.37  
    4.38 @@ -298,15 +296,14 @@
    4.39  			}
    4.40  		}
    4.41  
    4.42 -		kdtree->faces.push_back(face);	// add the face
    4.43 -		kdtree->num_faces++;
    4.44 +		kdtree->face_idx.push_back(i);	// add the face
    4.45  	}
    4.46  
    4.47  	// calculate the heuristic for the root
    4.48 -	kdtree->cost = eval_cost(kdtree->faces, kdtree->aabb, kdtree->axis);
    4.49 +	kdtree->cost = eval_cost(faces, &kdtree->face_idx[0], kdtree->face_idx.size(), kdtree->aabb, 0);
    4.50  
    4.51  	// now proceed splitting the root recursively
    4.52 -	if(!::build_kdtree(kdtree)) {
    4.53 +	if(!::build_kdtree(kdtree, faces)) {
    4.54  		fprintf(stderr, "failed to build kdtree\n");
    4.55  		return false;
    4.56  	}
    4.57 @@ -316,36 +313,34 @@
    4.58  	return true;
    4.59  }
    4.60  
    4.61 -static bool build_kdtree(KDNode *kd, int level)
    4.62 +static bool build_kdtree(KDNode *kd, const Face *faces, int level)
    4.63  {
    4.64  	int opt_max_depth = accel_param[ACCEL_PARAM_MAX_TREE_DEPTH];
    4.65  	int opt_max_items = accel_param[ACCEL_PARAM_MAX_NODE_ITEMS];
    4.66  	int tcost = accel_param[ACCEL_PARAM_COST_TRAVERSE];
    4.67  
    4.68 -	if(kd->num_faces == 0 || level >= opt_max_depth) {
    4.69 +	if(kd->face_idx.empty() || level >= opt_max_depth) {
    4.70  		return true;
    4.71  	}
    4.72  
    4.73  	int axis = level % 3;
    4.74 -	//float parent_sa = kd->aabb.calc_surface_area();
    4.75  
    4.76  	float best_cost[2], best_sum_cost = FLT_MAX;
    4.77  	float best_split;
    4.78  
    4.79 -	std::list<const Face*>::iterator it = kd->faces.begin();
    4.80 -	while(it != kd->faces.end()) {
    4.81 -		const Face *face = *it++;
    4.82 +	for(size_t i=0; i<kd->face_idx.size(); i++) {
    4.83 +		const Face *face = faces + kd->face_idx[i];
    4.84  
    4.85 -		for(int i=0; i<3; i++) {
    4.86 +		for(int j=0; j<3; j++) {
    4.87  			AABBox aabb_left, aabb_right;
    4.88 -			const float *split = face->v[i].pos;
    4.89 +			const float *split = face->v[j].pos;
    4.90  
    4.91  			aabb_left = aabb_right = kd->aabb;
    4.92  			aabb_left.max[axis] = split[axis];
    4.93  			aabb_right.min[axis] = split[axis];
    4.94  
    4.95 -			float left_cost = eval_cost(kd->faces, aabb_left, axis);
    4.96 -			float right_cost = eval_cost(kd->faces, aabb_right, axis);
    4.97 +			float left_cost = eval_cost(faces, &kd->face_idx[0], kd->face_idx.size(), aabb_left, axis);
    4.98 +			float right_cost = eval_cost(faces, &kd->face_idx[0], kd->face_idx.size(), aabb_right, axis);
    4.99  			float sum_cost = left_cost + right_cost - tcost;	// tcost is added twice
   4.100  
   4.101  			if(sum_cost < best_sum_cost) {
   4.102 @@ -358,10 +353,9 @@
   4.103  	}
   4.104  
   4.105  	//printf("current cost: %f,   best_cost: %f\n", kd->cost, best_sum_cost);
   4.106 -	if(best_sum_cost > kd->cost && (opt_max_items == 0 || kd->num_faces <= opt_max_items)) {
   4.107 +	if(best_sum_cost > kd->cost && (opt_max_items == 0 || (int)kd->face_idx.size() <= opt_max_items)) {
   4.108  		return true;	// stop splitting if it doesn't reduce the cost
   4.109  	}
   4.110 -	kd->pt = best_split;
   4.111  
   4.112  	// create the two children
   4.113  	KDNode *kdleft, *kdright;
   4.114 @@ -376,45 +370,40 @@
   4.115  	kdleft->cost = best_cost[0];
   4.116  	kdright->cost = best_cost[1];
   4.117  
   4.118 -	//kdleft->axis = kdright->axis = (axis + 1) % 3;
   4.119 -
   4.120 -	it = kd->faces.begin();
   4.121 -	while(it != kd->faces.end()) {
   4.122 -		const Face *face = *it++;
   4.123 +	for(size_t i=0; i<kd->face_idx.size(); i++) {
   4.124 +		int fidx = kd->face_idx[i];
   4.125 +		const Face *face = faces + fidx;
   4.126  
   4.127  		if(face->v[0].pos[axis] < best_split ||
   4.128  				face->v[1].pos[axis] < best_split ||
   4.129  				face->v[2].pos[axis] < best_split) {
   4.130 -			kdleft->faces.push_back(face);
   4.131 -			kdleft->num_faces++;
   4.132 +			kdleft->face_idx.push_back(fidx);
   4.133  		}
   4.134  		if(face->v[0].pos[axis] >= best_split ||
   4.135  				face->v[1].pos[axis] >= best_split ||
   4.136  				face->v[2].pos[axis] >= best_split) {
   4.137 -			kdright->faces.push_back(face);
   4.138 -			kdright->num_faces++;
   4.139 +			kdright->face_idx.push_back(fidx);
   4.140  		}
   4.141  	}
   4.142 -	kd->faces.clear();	// only leaves have faces
   4.143 +	kd->face_idx.clear();	// only leaves have faces
   4.144  
   4.145  	kd->left = kdleft;
   4.146  	kd->right = kdright;
   4.147  
   4.148 -	return build_kdtree(kd->left, level + 1) && build_kdtree(kd->right, level + 1);
   4.149 +	return build_kdtree(kd->left, faces, level + 1) && build_kdtree(kd->right, faces, level + 1);
   4.150  }
   4.151  
   4.152 -static float eval_cost(const std::list<const Face*> &faces, const AABBox &aabb, int axis, float par_sarea)
   4.153 +static float eval_cost(const Face *faces, const int *face_idx, int num_faces, const AABBox &aabb, int axis)
   4.154  {
   4.155  	int num_inside = 0;
   4.156  	int tcost = accel_param[ACCEL_PARAM_COST_TRAVERSE];
   4.157  	int icost = accel_param[ACCEL_PARAM_COST_INTERSECT];
   4.158  
   4.159 -	std::list<const Face*>::const_iterator it = faces.begin();
   4.160 -	while(it != faces.end()) {
   4.161 -		const Face *face = *it++;
   4.162 +	for(int i=0; i<num_faces; i++) {
   4.163 +		const Face *face = faces + face_idx[i];
   4.164  
   4.165 -		for(int i=0; i<3; i++) {
   4.166 -			if(face->v[i].pos[axis] >= aabb.min[axis] && face->v[i].pos[axis] < aabb.max[axis]) {
   4.167 +		for(int j=0; j<3; j++) {
   4.168 +			if(face->v[j].pos[axis] >= aabb.min[axis] && face->v[j].pos[axis] < aabb.max[axis]) {
   4.169  				num_inside++;
   4.170  				break;
   4.171  			}
   4.172 @@ -426,7 +415,7 @@
   4.173  		return FLT_MAX;	// heavily penalize 0-area voxels
   4.174  	}
   4.175  
   4.176 -	return tcost + (sarea / par_sarea) * num_inside * icost;
   4.177 +	return tcost + sarea * num_inside * icost;
   4.178  }
   4.179  
   4.180  static void free_kdtree(KDNode *node)
   4.181 @@ -454,20 +443,19 @@
   4.182  }
   4.183  
   4.184  #define MAX_FACES	(sizeof dest->face_idx / sizeof *dest->face_idx)
   4.185 -static void kdtree_gpu_flatten(KDNodeGPU *kdbuf, int idx, const KDNode *node, const Face *facebuf)
   4.186 +static void kdtree_gpu_flatten(KDNodeGPU *kdbuf, int idx, const KDNode *node)
   4.187  {
   4.188  	KDNodeGPU *dest = kdbuf + idx;
   4.189  
   4.190  	dest->aabb = node->aabb;
   4.191  	dest->num_faces = 0;
   4.192  
   4.193 -	std::list<const Face*>::const_iterator it = node->faces.begin();
   4.194 -	while(it != node->faces.end()) {
   4.195 +	for(size_t i=0; i<node->face_idx.size(); i++) {
   4.196  		if(dest->num_faces >= (int)MAX_FACES) {
   4.197  			fprintf(stderr, "kdtree_gpu_flatten WARNING: more than %d faces in node, skipping!\n", (int)MAX_FACES);
   4.198  			break;
   4.199  		}
   4.200 -		dest->face_idx[dest->num_faces++] = *it++ - facebuf;
   4.201 +		dest->face_idx[dest->num_faces++] = node->face_idx[i];
   4.202  	}
   4.203  
   4.204  	if(node->left) {
   4.205 @@ -476,8 +464,8 @@
   4.206  
   4.207  		dest->num_faces = -1;
   4.208  
   4.209 -		kdtree_gpu_flatten(kdbuf, idx * 2, node->left, facebuf);
   4.210 -		kdtree_gpu_flatten(kdbuf, idx * 2 + 1, node->right, facebuf);
   4.211 +		kdtree_gpu_flatten(kdbuf, idx * 2, node->left);
   4.212 +		kdtree_gpu_flatten(kdbuf, idx * 2 + 1, node->right);
   4.213  	}
   4.214  }
   4.215  
   4.216 @@ -488,7 +476,7 @@
   4.217  	for(int i=0; i<level; i++) {
   4.218  		fputs("   ", stdout);
   4.219  	}
   4.220 -	printf("- %d (cost: %f)\n", node->num_faces, node->cost);
   4.221 +	printf("- %d (cost: %f)\n", (int)node->face_idx.size(), node->cost);
   4.222  
   4.223  	print_item_counts(node->left, level + 1);
   4.224  	print_item_counts(node->right, level + 1);
     5.1 --- a/src/scene.h	Sun Aug 22 00:50:47 2010 +0100
     5.2 +++ b/src/scene.h	Tue Aug 24 05:43:57 2010 +0100
     5.3 @@ -40,21 +40,12 @@
     5.4  	float calc_surface_area() const;
     5.5  };
     5.6  
     5.7 -enum {
     5.8 -	KDAXIS_X,
     5.9 -	KDAXIS_Y,
    5.10 -	KDAXIS_Z
    5.11 -};
    5.12 -
    5.13  struct KDNode {
    5.14 -	int axis;
    5.15 -	float pt;
    5.16  	AABBox aabb;
    5.17  	float cost;
    5.18  
    5.19  	KDNode *left, *right;
    5.20 -	int num_faces;	// cause on some implementations list::size() is O(n)
    5.21 -	std::list<const Face*> faces;
    5.22 +	std::vector<int> face_idx;
    5.23  
    5.24  	KDNode();
    5.25  };
    5.26 @@ -115,4 +106,7 @@
    5.27  int kdtree_depth(const KDNode *tree);
    5.28  int kdtree_nodes(const KDNode *tree);
    5.29  
    5.30 +bool kdtree_dump(const KDNode *tree, const char *fname);
    5.31 +KDNode *kdtree_restore(const char *fname);
    5.32 +
    5.33  #endif	/* MESH_H_ */