clray
changeset 43:f9eec11e5acc
shoehorned the kdtree into an opnecl image and improved performance slightly
author | John Tsiombikas <nuclear@member.fsf.org> |
---|---|
date | Sat, 28 Aug 2010 09:38:49 +0100 |
parents | 1169f3d04135 |
children | e7f79c6ad246 |
files | rt.cl src/clray.cc src/ocl.cc src/rt.cc src/scene.h |
diffstat | 5 files changed, 151 insertions(+), 44 deletions(-) [+] |
line diff
1.1 --- a/rt.cl Sat Aug 28 02:01:16 2010 +0100 1.2 +++ b/rt.cl Sat Aug 28 09:38:49 2010 +0100 1.3 @@ -51,26 +51,28 @@ 1.4 global const struct Light *lights; 1.5 int num_lights; 1.6 global const struct Material *matlib; 1.7 - global const struct KDNode *kdtree; 1.8 + //global const struct KDNode *kdtree; 1.9 }; 1.10 1.11 struct AABBox { 1.12 float4 min, max; 1.13 }; 1.14 1.15 +#define MAX_NODE_FACES 32 1.16 struct KDNode { 1.17 struct AABBox aabb; 1.18 - int face_idx[32]; 1.19 + int face_idx[MAX_NODE_FACES]; 1.20 int num_faces; 1.21 int left, right; 1.22 int padding; 1.23 }; 1.24 1.25 +#define RAY_MAG 500.0 1.26 #define MIN_ENERGY 0.001 1.27 #define EPSILON 1e-5 1.28 1.29 -float4 shade(struct Ray ray, struct Scene *scn, const struct SurfPoint *sp); 1.30 -bool find_intersection(struct Ray ray, const struct Scene *scn, struct SurfPoint *sp); 1.31 +float4 shade(struct Ray ray, struct Scene *scn, const struct SurfPoint *sp, read_only image2d_t kdimg); 1.32 +bool find_intersection(struct Ray ray, const struct Scene *scn, struct SurfPoint *sp, read_only image2d_t kdimg); 1.33 bool intersect(struct Ray ray, global const struct Face *face, struct SurfPoint *sp); 1.34 bool intersect_aabb(struct Ray ray, struct AABBox aabb); 1.35 1.36 @@ -80,6 +82,8 @@ 1.37 float4 calc_bary(float4 pt, global const struct Face *face, float4 norm); 1.38 float mean(float4 v); 1.39 1.40 +void read_kdnode(int idx, struct KDNode *node, read_only image2d_t kdimg); 1.41 + 1.42 1.43 kernel void render(write_only image2d_t fb, 1.44 global const struct RendInfo *rinf, 1.45 @@ -89,7 +93,8 @@ 1.46 global const struct Ray *primrays, 1.47 global const float *xform, 1.48 global const float *invtrans, 1.49 - global const struct KDNode *kdtree) 1.50 + //global const struct KDNode *kdtree 1.51 + read_only image2d_t kdtree_img) 1.52 { 1.53 int idx = get_global_id(0); 1.54 1.55 @@ -100,7 +105,7 @@ 1.56 scn.lights = lights; 1.57 scn.num_lights = rinf->num_lights; 1.58 scn.matlib = matlib; 1.59 - scn.kdtree = kdtree; 1.60 + //scn.kdtree_img = kdtree_img; 1.61 1.62 struct Ray ray = primrays[idx]; 1.63 transform_ray(&ray, xform, invtrans); 1.64 @@ -111,8 +116,8 @@ 1.65 1.66 while(iter++ < rinf->max_iter && mean(energy) > MIN_ENERGY) { 1.67 struct SurfPoint sp; 1.68 - if(find_intersection(ray, &scn, &sp)) { 1.69 - pixel += shade(ray, &scn, &sp) * energy; 1.70 + if(find_intersection(ray, &scn, &sp, kdtree_img)) { 1.71 + pixel += shade(ray, &scn, &sp, kdtree_img) * energy; 1.72 1.73 float4 refl_col = sp.mat.ks * sp.mat.kr; 1.74 1.75 @@ -121,27 +126,25 @@ 1.76 1.77 energy *= refl_col; 1.78 } else { 1.79 - break; 1.80 + energy = (float4)(0.0, 0.0, 0.0, 0.0); 1.81 } 1.82 } 1.83 1.84 - int img_x = get_image_width(fb); 1.85 - 1.86 int2 coord; 1.87 - coord.x = idx % img_x; 1.88 - coord.y = idx / img_x; 1.89 + coord.x = idx % rinf->xsz; 1.90 + coord.y = idx / rinf->xsz; 1.91 1.92 write_imagef(fb, coord, pixel); 1.93 } 1.94 1.95 -float4 shade(struct Ray ray, struct Scene *scn, const struct SurfPoint *sp) 1.96 +float4 shade(struct Ray ray, struct Scene *scn, const struct SurfPoint *sp, read_only image2d_t kdimg) 1.97 { 1.98 float4 norm = sp->norm; 1.99 - bool entering = true; 1.100 + //bool entering = true; 1.101 1.102 if(dot(ray.dir, norm) >= 0.0) { 1.103 norm = -norm; 1.104 - entering = false; 1.105 + //entering = false; 1.106 } 1.107 1.108 float4 dcol = scn->ambient * sp->mat.kd; 1.109 @@ -154,16 +157,19 @@ 1.110 shadowray.origin = sp->pos; 1.111 shadowray.dir = ldir; 1.112 1.113 - if(!find_intersection(shadowray, scn, 0)) { 1.114 + if(!find_intersection(shadowray, scn, 0, kdimg)) { 1.115 ldir = normalize(ldir); 1.116 - float4 vdir = -normalize(ray.dir); 1.117 + float4 vdir = -ray.dir; 1.118 + vdir.x = native_divide(vdir.x, RAY_MAG); 1.119 + vdir.y = native_divide(vdir.y, RAY_MAG); 1.120 + vdir.z = native_divide(vdir.z, RAY_MAG); 1.121 float4 vref = reflect(vdir, norm); 1.122 1.123 float diff = fmax(dot(ldir, norm), 0.0f); 1.124 - dcol += sp->mat.kd * scn->lights[i].color * diff; 1.125 + dcol += sp->mat.kd /* scn->lights[i].color*/ * diff; 1.126 1.127 - float spec = powr(fmax(dot(ldir, vref), 0.0f), sp->mat.spow); 1.128 - scol += sp->mat.ks * scn->lights[i].color * spec; 1.129 + float spec = native_powr(fmax(dot(ldir, vref), 0.0f), sp->mat.spow); 1.130 + scol += sp->mat.ks /* scn->lights[i].color*/ * spec; 1.131 } 1.132 } 1.133 1.134 @@ -171,7 +177,7 @@ 1.135 } 1.136 1.137 #define STACK_SIZE 64 1.138 -bool find_intersection(struct Ray ray, const struct Scene *scn, struct SurfPoint *spres) 1.139 +bool find_intersection(struct Ray ray, const struct Scene *scn, struct SurfPoint *spres, read_only image2d_t kdimg) 1.140 { 1.141 struct SurfPoint sp0; 1.142 sp0.t = 1.0; 1.143 @@ -184,14 +190,15 @@ 1.144 while(top > 0) { 1.145 int idx = idxstack[--top]; // remove this index from the stack and process it 1.146 1.147 - global const struct KDNode *node = scn->kdtree + idx; 1.148 + struct KDNode node; 1.149 + read_kdnode(idx, &node, kdimg); 1.150 1.151 - if(intersect_aabb(ray, node->aabb)) { 1.152 - if(node->left == -1) { 1.153 + if(intersect_aabb(ray, node.aabb)) { 1.154 + if(node.left == -1) { 1.155 // leaf node... check each face in turn and update the nearest intersection as needed 1.156 - for(int i=0; i<node->num_faces; i++) { 1.157 + for(int i=0; i<node.num_faces; i++) { 1.158 struct SurfPoint spt; 1.159 - int fidx = node->face_idx[i]; 1.160 + int fidx = node.face_idx[i]; 1.161 1.162 if(intersect(ray, scn->faces + fidx, &spt) && spt.t < sp0.t) { 1.163 sp0 = spt; 1.164 @@ -199,8 +206,8 @@ 1.165 } 1.166 } else { 1.167 // internal node... recurse to the children 1.168 - idxstack[top++] = node->left; 1.169 - idxstack[top++] = node->right; 1.170 + idxstack[top++] = node.left; 1.171 + idxstack[top++] = node.right; 1.172 } 1.173 } 1.174 } 1.175 @@ -232,7 +239,7 @@ 1.176 float4 vec = pt - origin; 1.177 1.178 float ndotvec = dot(norm, vec); 1.179 - float t = ndotvec / ndotdir; 1.180 + float t = native_divide(ndotvec, ndotdir); 1.181 1.182 if(t < EPSILON || t > 1.0) { 1.183 return false; 1.184 @@ -269,12 +276,12 @@ 1.185 }; 1.186 1.187 int xsign = (int)(ray.dir.x < 0.0); 1.188 - float invdirx = 1.0 / ray.dir.x; 1.189 + float invdirx = native_recip(ray.dir.x); 1.190 float tmin = (bbox[xsign].x - ray.origin.x) * invdirx; 1.191 float tmax = (bbox[1 - xsign].x - ray.origin.x) * invdirx; 1.192 1.193 int ysign = (int)(ray.dir.y < 0.0); 1.194 - float invdiry = 1.0 / ray.dir.y; 1.195 + float invdiry = native_recip(ray.dir.y); 1.196 float tymin = (bbox[ysign].y - ray.origin.y) * invdiry; 1.197 float tymax = (bbox[1 - ysign].y - ray.origin.y) * invdiry; 1.198 1.199 @@ -286,7 +293,7 @@ 1.200 if(tymax < tmax) tmax = tymax; 1.201 1.202 int zsign = (int)(ray.dir.z < 0.0); 1.203 - float invdirz = 1.0 / ray.dir.z; 1.204 + float invdirz = native_recip(ray.dir.z); 1.205 float tzmin = (bbox[zsign].z - ray.origin.z) * invdirz; 1.206 float tzmax = (bbox[1 - zsign].z - ray.origin.z) * invdirz; 1.207 1.208 @@ -345,9 +352,9 @@ 1.209 float a1 = fabs(dot(x20, norm)) * 0.5; 1.210 float a2 = fabs(dot(x01, norm)) * 0.5; 1.211 1.212 - bc.x = a0 / area; 1.213 - bc.y = a1 / area; 1.214 - bc.z = a2 / area; 1.215 + bc.x = native_divide(a0, area); 1.216 + bc.y = native_divide(a1, area); 1.217 + bc.z = native_divide(a2, area); 1.218 return bc; 1.219 } 1.220 1.221 @@ -355,3 +362,32 @@ 1.222 { 1.223 return native_divide(v.x + v.y + v.z, 3.0); 1.224 } 1.225 + 1.226 + 1.227 +const sampler_t kdsampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; 1.228 + 1.229 +// read a KD-tree node from a texture scanline 1.230 +void read_kdnode(int idx, struct KDNode *node, read_only image2d_t kdimg) 1.231 +{ 1.232 + int2 tc; 1.233 + tc.x = 0; 1.234 + tc.y = idx; 1.235 + 1.236 + node->aabb.min = read_imagef(kdimg, kdsampler, tc); tc.x++; 1.237 + node->aabb.max = read_imagef(kdimg, kdsampler, tc); 1.238 + 1.239 + tc.x = 2 + MAX_NODE_FACES / 4; 1.240 + float4 pix = read_imagef(kdimg, kdsampler, tc); 1.241 + node->num_faces = (int)pix.x; 1.242 + node->left = (int)pix.y; 1.243 + node->right = (int)pix.z; 1.244 + 1.245 + tc.x = 2; 1.246 + for(int i=0; i<node->num_faces; i+=4) { 1.247 + float4 pix = read_imagef(kdimg, kdsampler, tc); tc.x++; 1.248 + node->face_idx[i] = (int)pix.x; 1.249 + node->face_idx[i + 1] = (int)pix.y; 1.250 + node->face_idx[i + 2] = (int)pix.z; 1.251 + node->face_idx[i + 3] = (int)pix.w; 1.252 + } 1.253 +}
2.1 --- a/src/clray.cc Sat Aug 28 02:01:16 2010 +0100 2.2 +++ b/src/clray.cc Sat Aug 28 09:38:49 2010 +0100 2.3 @@ -34,6 +34,7 @@ 2.4 static Scene scn; 2.5 static unsigned int tex; 2.6 2.7 + 2.8 int main(int argc, char **argv) 2.9 { 2.10 glutInitWindowSize(800, 600);
3.1 --- a/src/ocl.cc Sat Aug 28 02:01:16 2010 +0100 3.2 +++ b/src/ocl.cc Sat Aug 28 09:38:49 2010 +0100 3.3 @@ -503,7 +503,9 @@ 3.4 { 3.5 int err; 3.6 3.7 - if((err = clBuildProgram(prog, 0, 0, "-cl-mad-enable", 0, 0)) != 0) { 3.8 + const char *opt = "-cl-mad-enable -cl-single-precision-constant -cl-fast-relaxed-math"; 3.9 + 3.10 + if((err = clBuildProgram(prog, 0, 0, opt, 0, 0)) != 0) { 3.11 size_t sz; 3.12 clGetProgramBuildInfo(prog, devinf.id, CL_PROGRAM_BUILD_LOG, 0, 0, &sz); 3.13
4.1 --- a/src/rt.cc Sat Aug 28 02:01:16 2010 +0100 4.2 +++ b/src/rt.cc Sat Aug 28 09:38:49 2010 +0100 4.3 @@ -39,6 +39,7 @@ 4.4 }; 4.5 4.6 static Ray get_primary_ray(int x, int y, int w, int h, float vfov_deg); 4.7 +static float *create_kdimage(const KDNodeGPU *kdtree, int num_nodes, int *xsz_ret, int *ysz_ret); 4.8 4.9 static Face *faces; 4.10 static Ray *prim_rays; 4.11 @@ -52,6 +53,9 @@ 4.12 4.13 static RendInfo rinf; 4.14 4.15 +static long timing_sample_sum; 4.16 +static long num_timing_samples; 4.17 + 4.18 4.19 bool init_renderer(int xsz, int ysz, Scene *scn, unsigned int tex) 4.20 { 4.21 @@ -91,7 +95,9 @@ 4.22 fprintf(stderr, "failed to create kdtree buffer\n"); 4.23 return false; 4.24 } 4.25 - // XXX now we can actually destroy the original kdtree and keep only the GPU version 4.26 + 4.27 + int kdimg_xsz, kdimg_ysz; 4.28 + float *kdimg_pixels = create_kdimage(kdbuf, scn->get_num_kdnodes(), &kdimg_xsz, &kdimg_ysz); 4.29 4.30 /* setup argument buffers */ 4.31 #ifdef CLGL_INTEROP 4.32 @@ -106,7 +112,11 @@ 4.33 prog->set_arg_buffer(KARG_PRIM_RAYS, ARG_RD, xsz * ysz * sizeof *prim_rays, prim_rays); 4.34 prog->set_arg_buffer(KARG_XFORM, ARG_RD, 16 * sizeof(float)); 4.35 prog->set_arg_buffer(KARG_INVTRANS_XFORM, ARG_RD, 16 * sizeof(float)); 4.36 - prog->set_arg_buffer(KARG_KDTREE, ARG_RD, scn->get_num_kdnodes() * sizeof *kdbuf, kdbuf); 4.37 + //prog->set_arg_buffer(KARG_KDTREE, ARG_RD, scn->get_num_kdnodes() * sizeof *kdbuf, kdbuf); 4.38 + prog->set_arg_image(KARG_KDTREE, ARG_RD, kdimg_xsz, kdimg_ysz, kdimg_pixels); 4.39 + 4.40 + delete [] kdimg_pixels; 4.41 + 4.42 4.43 if(prog->get_num_args() < NUM_KERNEL_ARGS) { 4.44 return false; 4.45 @@ -125,6 +135,8 @@ 4.46 void destroy_renderer() 4.47 { 4.48 delete prog; 4.49 + 4.50 + printf("rendertime mean: %ld msec\n", timing_sample_sum / num_timing_samples); 4.51 } 4.52 4.53 bool render() 4.54 @@ -172,7 +184,11 @@ 4.55 unmap_mem_buffer(mbuf); 4.56 #endif 4.57 4.58 - printf("rendered in %ld msec\n", get_msec() - tm0); 4.59 + long msec = get_msec() - tm0; 4.60 + timing_sample_sum += msec; 4.61 + num_timing_samples++; 4.62 + 4.63 + printf("rendered in %ld msec\n", msec); 4.64 return true; 4.65 } 4.66 4.67 @@ -273,10 +289,61 @@ 4.68 float py = 1.0 - ((float)y / (float)h) * ysz; 4.69 float pz = 1.0 / tan(0.5 * vfov); 4.70 4.71 - px *= 100.0; 4.72 - py *= 100.0; 4.73 - pz *= 100.0; 4.74 + float mag = sqrt(px * px + py * py + pz * pz); 4.75 + 4.76 + px = px * 500.0 / mag; 4.77 + py = py * 500.0 / mag; 4.78 + pz = pz * 500.0 / mag; 4.79 4.80 Ray ray = {{0, 0, 0, 1}, {px, py, -pz, 1}}; 4.81 return ray; 4.82 } 4.83 + 4.84 +static int next_pow2(int x) 4.85 +{ 4.86 + x--; 4.87 + x = (x >> 1) | x; 4.88 + x = (x >> 2) | x; 4.89 + x = (x >> 4) | x; 4.90 + x = (x >> 8) | x; 4.91 + x = (x >> 16) | x; 4.92 + return x + 1; 4.93 +} 4.94 + 4.95 +static float *create_kdimage(const KDNodeGPU *kdtree, int num_nodes, int *xsz_ret, int *ysz_ret) 4.96 +{ 4.97 + int xsz = 16; 4.98 + int ysz = next_pow2(num_nodes); 4.99 + 4.100 + printf("creating kdtree image %dx%d (%d nodes)\n", xsz, ysz, num_nodes); 4.101 + 4.102 + float *img = new float[4 * xsz * ysz]; 4.103 + memset(img, 0, 4 * xsz * ysz * sizeof *img); 4.104 + 4.105 + for(int i=0; i<num_nodes; i++) { 4.106 + float *ptr = img + i * 4 * xsz; 4.107 + 4.108 + *ptr++ = kdtree[i].aabb.min[0]; 4.109 + *ptr++ = kdtree[i].aabb.min[1]; 4.110 + *ptr++ = kdtree[i].aabb.min[2]; 4.111 + *ptr++ = 0.0; 4.112 + 4.113 + *ptr++ = kdtree[i].aabb.max[0]; 4.114 + *ptr++ = kdtree[i].aabb.max[1]; 4.115 + *ptr++ = kdtree[i].aabb.max[2]; 4.116 + *ptr++ = 0.0; 4.117 + 4.118 + for(int j=0; j<MAX_NODE_FACES; j++) { 4.119 + *ptr++ = j < kdtree[i].num_faces ? (float)kdtree[i].face_idx[j] : 0.0f; 4.120 + } 4.121 + 4.122 + *ptr++ = (float)kdtree[i].num_faces; 4.123 + *ptr++ = (float)kdtree[i].left; 4.124 + *ptr++ = (float)kdtree[i].right; 4.125 + *ptr++ = 0.0; 4.126 + } 4.127 + 4.128 + if(xsz_ret) *xsz_ret = xsz; 4.129 + if(ysz_ret) *ysz_ret = ysz; 4.130 + return img; 4.131 +}
5.1 --- a/src/scene.h Sat Aug 28 02:01:16 2010 +0100 5.2 +++ b/src/scene.h Sat Aug 28 09:38:49 2010 +0100 5.3 @@ -51,9 +51,10 @@ 5.4 KDNode(); 5.5 }; 5.6 5.7 +#define MAX_NODE_FACES 32 5.8 struct KDNodeGPU { 5.9 AABBox aabb; 5.10 - int face_idx[32]; 5.11 + int face_idx[MAX_NODE_FACES]; 5.12 int num_faces; 5.13 int left, right; 5.14 int padding;