clray
changeset 32:4cf4919c3812
performance sucks
author | John Tsiombikas <nuclear@member.fsf.org> |
---|---|
date | Tue, 24 Aug 2010 05:43:57 +0100 |
parents | 92786fc3317e |
children | 931d13b72f83 |
files | src/clray.cc src/ocl.cc src/rt.cc src/scene.cc src/scene.h |
diffstat | 5 files changed, 65 insertions(+), 67 deletions(-) [+] |
line diff
1.1 --- a/src/clray.cc Sun Aug 22 00:50:47 2010 +0100 1.2 +++ b/src/clray.cc Tue Aug 24 05:43:57 2010 +0100 1.3 @@ -26,7 +26,7 @@ 1.4 static float cam_theta, cam_phi = 25.0; 1.5 static float cam_dist = 10.0; 1.6 1.7 -static bool dbg_glrender = true; 1.8 +static bool dbg_glrender = false; 1.9 static bool dbg_show_kdtree = false; 1.10 static bool dbg_show_obj = true; 1.11 1.12 @@ -107,6 +107,12 @@ 1.13 } 1.14 atexit(cleanup); 1.15 1.16 + unsigned int *test_pattern = new unsigned int[xsz * ysz]; 1.17 + for(int i=0; i<ysz; i++) { 1.18 + for(int j=0; j<xsz; j++) { 1.19 + test_pattern[i * xsz + j] = ((i >> 4) & 1) == ((j >> 4) & 1) ? 0xff0000 : 0xff00; 1.20 + } 1.21 + } 1.22 1.23 /*glGenTextures(1, &tex); 1.24 glBindTexture(GL_TEXTURE_2D, tex);*/ 1.25 @@ -114,7 +120,8 @@ 1.26 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP); 1.27 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); 1.28 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); 1.29 - glTexImage2D(GL_TEXTURE_2D, 0, 4, xsz, ysz, 0, GL_RGBA, GL_FLOAT, 0); 1.30 + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB32F, xsz, ysz, 0, GL_RGBA, GL_UNSIGNED_BYTE, test_pattern); 1.31 + delete [] test_pattern; 1.32 1.33 glutMainLoop(); 1.34 return 0;
2.1 --- a/src/ocl.cc Sun Aug 22 00:50:47 2010 +0100 2.2 +++ b/src/ocl.cc Tue Aug 24 05:43:57 2010 +0100 2.3 @@ -377,10 +377,15 @@ 2.4 va_end(ap); 2.5 2.6 int err; 2.7 - if((err = clEnqueueNDRangeKernel(cmdq, kernel, dim, 0, global_size, 0, 0, 0, 0)) != 0) { 2.8 + cl_event event; 2.9 + 2.10 + if((err = clEnqueueNDRangeKernel(cmdq, kernel, dim, 0, global_size, 0, 0, 0, &event)) != 0) { 2.11 fprintf(stderr, "error executing kernel: %s\n", clstrerror(err)); 2.12 return false; 2.13 } 2.14 + 2.15 + clWaitForEvents(1, &event); 2.16 + clReleaseEvent(event); 2.17 return true; 2.18 } 2.19
3.1 --- a/src/rt.cc Sun Aug 22 00:50:47 2010 +0100 3.2 +++ b/src/rt.cc Tue Aug 24 05:43:57 2010 +0100 3.3 @@ -5,6 +5,7 @@ 3.4 #include "ogl.h" 3.5 #include "ocl.h" 3.6 #include "scene.h" 3.7 +#include "timer.h" 3.8 3.9 // kernel arguments 3.10 enum { 3.11 @@ -90,6 +91,7 @@ 3.12 fprintf(stderr, "failed to create kdtree buffer\n"); 3.13 return false; 3.14 } 3.15 + // XXX now we can actually destroy the original kdtree and keep only the GPU version 3.16 3.17 /* setup argument buffers */ 3.18 prog->set_arg_buffer(KARG_FRAMEBUFFER, ARG_WR, xsz * ysz * 4 * sizeof(float)); 3.19 @@ -123,10 +125,14 @@ 3.20 3.21 bool render() 3.22 { 3.23 + long tm0 = get_msec(); 3.24 + 3.25 if(!prog->run(1, global_size)) { 3.26 return false; 3.27 } 3.28 3.29 + long tm_run = get_msec() - tm0; 3.30 + 3.31 CLMemBuffer *mbuf = prog->get_arg_buffer(KARG_FRAMEBUFFER); 3.32 void *fb = map_mem_buffer(mbuf, MAP_RD); 3.33 if(!fb) { 3.34 @@ -134,14 +140,12 @@ 3.35 return false; 3.36 } 3.37 3.38 - static int foo = 0; 3.39 - if(!foo++) { 3.40 - bool write_ppm(const char *fname, float *fb, int xsz, int ysz); 3.41 - write_ppm("foo.ppm", (float*)fb, rinf.xsz, rinf.ysz); 3.42 - } 3.43 - 3.44 glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, rinf.xsz, rinf.ysz, GL_RGBA, GL_FLOAT, fb); 3.45 unmap_mem_buffer(mbuf); 3.46 + 3.47 + long tm_upd = get_msec() - tm0 - tm_run; 3.48 + 3.49 + printf("render %ld msec (%ld run, %ld upd)\n", tm_run + tm_upd, tm_run, tm_upd); 3.50 return true; 3.51 } 3.52
4.1 --- a/src/scene.cc Sun Aug 22 00:50:47 2010 +0100 4.2 +++ b/src/scene.cc Tue Aug 24 05:43:57 2010 +0100 4.3 @@ -6,10 +6,10 @@ 4.4 4.5 4.6 static void draw_kdtree(const KDNode *node, int level = 0); 4.7 -static bool build_kdtree(KDNode *kd, int level = 0); 4.8 -static float eval_cost(const std::list<const Face*> &faces, const AABBox &aabb, int axis, float par_sarea = 1.0); 4.9 +static bool build_kdtree(KDNode *kd, const Face *faces, int level = 0); 4.10 +static float eval_cost(const Face *faces, const int *face_idx, int num_faces, const AABBox &aabb, int axis); 4.11 static void free_kdtree(KDNode *node); 4.12 -static void kdtree_gpu_flatten(KDNodeGPU *kdbuf, int idx, const KDNode *node, const Face *facebuf); 4.13 +static void kdtree_gpu_flatten(KDNodeGPU *kdbuf, int idx, const KDNode *node); 4.14 static void print_item_counts(const KDNode *node, int level); 4.15 4.16 4.17 @@ -58,10 +58,8 @@ 4.18 4.19 KDNode::KDNode() 4.20 { 4.21 - axis = 0; 4.22 - pt = 0.0; 4.23 left = right = 0; 4.24 - num_faces = 0; 4.25 + cost = 0.0; 4.26 } 4.27 4.28 4.29 @@ -175,7 +173,7 @@ 4.30 printf("allocating storage for the complete tree (%d)\n", max_nodes); 4.31 4.32 kdbuf = new KDNodeGPU[max_nodes + 1]; 4.33 - kdtree_gpu_flatten(kdbuf, 1, kdtree, get_face_buffer()); 4.34 + kdtree_gpu_flatten(kdbuf, 1, kdtree); 4.35 return kdbuf; 4.36 } 4.37 4.38 @@ -298,15 +296,14 @@ 4.39 } 4.40 } 4.41 4.42 - kdtree->faces.push_back(face); // add the face 4.43 - kdtree->num_faces++; 4.44 + kdtree->face_idx.push_back(i); // add the face 4.45 } 4.46 4.47 // calculate the heuristic for the root 4.48 - kdtree->cost = eval_cost(kdtree->faces, kdtree->aabb, kdtree->axis); 4.49 + kdtree->cost = eval_cost(faces, &kdtree->face_idx[0], kdtree->face_idx.size(), kdtree->aabb, 0); 4.50 4.51 // now proceed splitting the root recursively 4.52 - if(!::build_kdtree(kdtree)) { 4.53 + if(!::build_kdtree(kdtree, faces)) { 4.54 fprintf(stderr, "failed to build kdtree\n"); 4.55 return false; 4.56 } 4.57 @@ -316,36 +313,34 @@ 4.58 return true; 4.59 } 4.60 4.61 -static bool build_kdtree(KDNode *kd, int level) 4.62 +static bool build_kdtree(KDNode *kd, const Face *faces, int level) 4.63 { 4.64 int opt_max_depth = accel_param[ACCEL_PARAM_MAX_TREE_DEPTH]; 4.65 int opt_max_items = accel_param[ACCEL_PARAM_MAX_NODE_ITEMS]; 4.66 int tcost = accel_param[ACCEL_PARAM_COST_TRAVERSE]; 4.67 4.68 - if(kd->num_faces == 0 || level >= opt_max_depth) { 4.69 + if(kd->face_idx.empty() || level >= opt_max_depth) { 4.70 return true; 4.71 } 4.72 4.73 int axis = level % 3; 4.74 - //float parent_sa = kd->aabb.calc_surface_area(); 4.75 4.76 float best_cost[2], best_sum_cost = FLT_MAX; 4.77 float best_split; 4.78 4.79 - std::list<const Face*>::iterator it = kd->faces.begin(); 4.80 - while(it != kd->faces.end()) { 4.81 - const Face *face = *it++; 4.82 + for(size_t i=0; i<kd->face_idx.size(); i++) { 4.83 + const Face *face = faces + kd->face_idx[i]; 4.84 4.85 - for(int i=0; i<3; i++) { 4.86 + for(int j=0; j<3; j++) { 4.87 AABBox aabb_left, aabb_right; 4.88 - const float *split = face->v[i].pos; 4.89 + const float *split = face->v[j].pos; 4.90 4.91 aabb_left = aabb_right = kd->aabb; 4.92 aabb_left.max[axis] = split[axis]; 4.93 aabb_right.min[axis] = split[axis]; 4.94 4.95 - float left_cost = eval_cost(kd->faces, aabb_left, axis); 4.96 - float right_cost = eval_cost(kd->faces, aabb_right, axis); 4.97 + float left_cost = eval_cost(faces, &kd->face_idx[0], kd->face_idx.size(), aabb_left, axis); 4.98 + float right_cost = eval_cost(faces, &kd->face_idx[0], kd->face_idx.size(), aabb_right, axis); 4.99 float sum_cost = left_cost + right_cost - tcost; // tcost is added twice 4.100 4.101 if(sum_cost < best_sum_cost) { 4.102 @@ -358,10 +353,9 @@ 4.103 } 4.104 4.105 //printf("current cost: %f, best_cost: %f\n", kd->cost, best_sum_cost); 4.106 - if(best_sum_cost > kd->cost && (opt_max_items == 0 || kd->num_faces <= opt_max_items)) { 4.107 + if(best_sum_cost > kd->cost && (opt_max_items == 0 || (int)kd->face_idx.size() <= opt_max_items)) { 4.108 return true; // stop splitting if it doesn't reduce the cost 4.109 } 4.110 - kd->pt = best_split; 4.111 4.112 // create the two children 4.113 KDNode *kdleft, *kdright; 4.114 @@ -376,45 +370,40 @@ 4.115 kdleft->cost = best_cost[0]; 4.116 kdright->cost = best_cost[1]; 4.117 4.118 - //kdleft->axis = kdright->axis = (axis + 1) % 3; 4.119 - 4.120 - it = kd->faces.begin(); 4.121 - while(it != kd->faces.end()) { 4.122 - const Face *face = *it++; 4.123 + for(size_t i=0; i<kd->face_idx.size(); i++) { 4.124 + int fidx = kd->face_idx[i]; 4.125 + const Face *face = faces + fidx; 4.126 4.127 if(face->v[0].pos[axis] < best_split || 4.128 face->v[1].pos[axis] < best_split || 4.129 face->v[2].pos[axis] < best_split) { 4.130 - kdleft->faces.push_back(face); 4.131 - kdleft->num_faces++; 4.132 + kdleft->face_idx.push_back(fidx); 4.133 } 4.134 if(face->v[0].pos[axis] >= best_split || 4.135 face->v[1].pos[axis] >= best_split || 4.136 face->v[2].pos[axis] >= best_split) { 4.137 - kdright->faces.push_back(face); 4.138 - kdright->num_faces++; 4.139 + kdright->face_idx.push_back(fidx); 4.140 } 4.141 } 4.142 - kd->faces.clear(); // only leaves have faces 4.143 + kd->face_idx.clear(); // only leaves have faces 4.144 4.145 kd->left = kdleft; 4.146 kd->right = kdright; 4.147 4.148 - return build_kdtree(kd->left, level + 1) && build_kdtree(kd->right, level + 1); 4.149 + return build_kdtree(kd->left, faces, level + 1) && build_kdtree(kd->right, faces, level + 1); 4.150 } 4.151 4.152 -static float eval_cost(const std::list<const Face*> &faces, const AABBox &aabb, int axis, float par_sarea) 4.153 +static float eval_cost(const Face *faces, const int *face_idx, int num_faces, const AABBox &aabb, int axis) 4.154 { 4.155 int num_inside = 0; 4.156 int tcost = accel_param[ACCEL_PARAM_COST_TRAVERSE]; 4.157 int icost = accel_param[ACCEL_PARAM_COST_INTERSECT]; 4.158 4.159 - std::list<const Face*>::const_iterator it = faces.begin(); 4.160 - while(it != faces.end()) { 4.161 - const Face *face = *it++; 4.162 + for(int i=0; i<num_faces; i++) { 4.163 + const Face *face = faces + face_idx[i]; 4.164 4.165 - for(int i=0; i<3; i++) { 4.166 - if(face->v[i].pos[axis] >= aabb.min[axis] && face->v[i].pos[axis] < aabb.max[axis]) { 4.167 + for(int j=0; j<3; j++) { 4.168 + if(face->v[j].pos[axis] >= aabb.min[axis] && face->v[j].pos[axis] < aabb.max[axis]) { 4.169 num_inside++; 4.170 break; 4.171 } 4.172 @@ -426,7 +415,7 @@ 4.173 return FLT_MAX; // heavily penalize 0-area voxels 4.174 } 4.175 4.176 - return tcost + (sarea / par_sarea) * num_inside * icost; 4.177 + return tcost + sarea * num_inside * icost; 4.178 } 4.179 4.180 static void free_kdtree(KDNode *node) 4.181 @@ -454,20 +443,19 @@ 4.182 } 4.183 4.184 #define MAX_FACES (sizeof dest->face_idx / sizeof *dest->face_idx) 4.185 -static void kdtree_gpu_flatten(KDNodeGPU *kdbuf, int idx, const KDNode *node, const Face *facebuf) 4.186 +static void kdtree_gpu_flatten(KDNodeGPU *kdbuf, int idx, const KDNode *node) 4.187 { 4.188 KDNodeGPU *dest = kdbuf + idx; 4.189 4.190 dest->aabb = node->aabb; 4.191 dest->num_faces = 0; 4.192 4.193 - std::list<const Face*>::const_iterator it = node->faces.begin(); 4.194 - while(it != node->faces.end()) { 4.195 + for(size_t i=0; i<node->face_idx.size(); i++) { 4.196 if(dest->num_faces >= (int)MAX_FACES) { 4.197 fprintf(stderr, "kdtree_gpu_flatten WARNING: more than %d faces in node, skipping!\n", (int)MAX_FACES); 4.198 break; 4.199 } 4.200 - dest->face_idx[dest->num_faces++] = *it++ - facebuf; 4.201 + dest->face_idx[dest->num_faces++] = node->face_idx[i]; 4.202 } 4.203 4.204 if(node->left) { 4.205 @@ -476,8 +464,8 @@ 4.206 4.207 dest->num_faces = -1; 4.208 4.209 - kdtree_gpu_flatten(kdbuf, idx * 2, node->left, facebuf); 4.210 - kdtree_gpu_flatten(kdbuf, idx * 2 + 1, node->right, facebuf); 4.211 + kdtree_gpu_flatten(kdbuf, idx * 2, node->left); 4.212 + kdtree_gpu_flatten(kdbuf, idx * 2 + 1, node->right); 4.213 } 4.214 } 4.215 4.216 @@ -488,7 +476,7 @@ 4.217 for(int i=0; i<level; i++) { 4.218 fputs(" ", stdout); 4.219 } 4.220 - printf("- %d (cost: %f)\n", node->num_faces, node->cost); 4.221 + printf("- %d (cost: %f)\n", (int)node->face_idx.size(), node->cost); 4.222 4.223 print_item_counts(node->left, level + 1); 4.224 print_item_counts(node->right, level + 1);
5.1 --- a/src/scene.h Sun Aug 22 00:50:47 2010 +0100 5.2 +++ b/src/scene.h Tue Aug 24 05:43:57 2010 +0100 5.3 @@ -40,21 +40,12 @@ 5.4 float calc_surface_area() const; 5.5 }; 5.6 5.7 -enum { 5.8 - KDAXIS_X, 5.9 - KDAXIS_Y, 5.10 - KDAXIS_Z 5.11 -}; 5.12 - 5.13 struct KDNode { 5.14 - int axis; 5.15 - float pt; 5.16 AABBox aabb; 5.17 float cost; 5.18 5.19 KDNode *left, *right; 5.20 - int num_faces; // cause on some implementations list::size() is O(n) 5.21 - std::list<const Face*> faces; 5.22 + std::vector<int> face_idx; 5.23 5.24 KDNode(); 5.25 }; 5.26 @@ -115,4 +106,7 @@ 5.27 int kdtree_depth(const KDNode *tree); 5.28 int kdtree_nodes(const KDNode *tree); 5.29 5.30 +bool kdtree_dump(const KDNode *tree, const char *fname); 5.31 +KDNode *kdtree_restore(const char *fname); 5.32 + 5.33 #endif /* MESH_H_ */