# HG changeset patch # User John Tsiombikas # Date 1282984729 -3600 # Node ID f9eec11e5accc059c9e193d4e645b312f2a94627 # Parent 1169f3d04135283a3de8638985f84deaac1dc7da shoehorned the kdtree into an opnecl image and improved performance slightly diff -r 1169f3d04135 -r f9eec11e5acc rt.cl --- a/rt.cl Sat Aug 28 02:01:16 2010 +0100 +++ b/rt.cl Sat Aug 28 09:38:49 2010 +0100 @@ -51,26 +51,28 @@ global const struct Light *lights; int num_lights; global const struct Material *matlib; - global const struct KDNode *kdtree; + //global const struct KDNode *kdtree; }; struct AABBox { float4 min, max; }; +#define MAX_NODE_FACES 32 struct KDNode { struct AABBox aabb; - int face_idx[32]; + int face_idx[MAX_NODE_FACES]; int num_faces; int left, right; int padding; }; +#define RAY_MAG 500.0 #define MIN_ENERGY 0.001 #define EPSILON 1e-5 -float4 shade(struct Ray ray, struct Scene *scn, const struct SurfPoint *sp); -bool find_intersection(struct Ray ray, const struct Scene *scn, struct SurfPoint *sp); +float4 shade(struct Ray ray, struct Scene *scn, const struct SurfPoint *sp, read_only image2d_t kdimg); +bool find_intersection(struct Ray ray, const struct Scene *scn, struct SurfPoint *sp, read_only image2d_t kdimg); bool intersect(struct Ray ray, global const struct Face *face, struct SurfPoint *sp); bool intersect_aabb(struct Ray ray, struct AABBox aabb); @@ -80,6 +82,8 @@ float4 calc_bary(float4 pt, global const struct Face *face, float4 norm); float mean(float4 v); +void read_kdnode(int idx, struct KDNode *node, read_only image2d_t kdimg); + kernel void render(write_only image2d_t fb, global const struct RendInfo *rinf, @@ -89,7 +93,8 @@ global const struct Ray *primrays, global const float *xform, global const float *invtrans, - global const struct KDNode *kdtree) + //global const struct KDNode *kdtree + read_only image2d_t kdtree_img) { int idx = get_global_id(0); @@ -100,7 +105,7 @@ scn.lights = lights; scn.num_lights = rinf->num_lights; scn.matlib = matlib; - scn.kdtree = kdtree; + //scn.kdtree_img = kdtree_img; struct Ray ray = primrays[idx]; transform_ray(&ray, xform, invtrans); @@ -111,8 +116,8 @@ while(iter++ < rinf->max_iter && mean(energy) > MIN_ENERGY) { struct SurfPoint sp; - if(find_intersection(ray, &scn, &sp)) { - pixel += shade(ray, &scn, &sp) * energy; + if(find_intersection(ray, &scn, &sp, kdtree_img)) { + pixel += shade(ray, &scn, &sp, kdtree_img) * energy; float4 refl_col = sp.mat.ks * sp.mat.kr; @@ -121,27 +126,25 @@ energy *= refl_col; } else { - break; + energy = (float4)(0.0, 0.0, 0.0, 0.0); } } - int img_x = get_image_width(fb); - int2 coord; - coord.x = idx % img_x; - coord.y = idx / img_x; + coord.x = idx % rinf->xsz; + coord.y = idx / rinf->xsz; write_imagef(fb, coord, pixel); } -float4 shade(struct Ray ray, struct Scene *scn, const struct SurfPoint *sp) +float4 shade(struct Ray ray, struct Scene *scn, const struct SurfPoint *sp, read_only image2d_t kdimg) { float4 norm = sp->norm; - bool entering = true; + //bool entering = true; if(dot(ray.dir, norm) >= 0.0) { norm = -norm; - entering = false; + //entering = false; } float4 dcol = scn->ambient * sp->mat.kd; @@ -154,16 +157,19 @@ shadowray.origin = sp->pos; shadowray.dir = ldir; - if(!find_intersection(shadowray, scn, 0)) { + if(!find_intersection(shadowray, scn, 0, kdimg)) { ldir = normalize(ldir); - float4 vdir = -normalize(ray.dir); + float4 vdir = -ray.dir; + vdir.x = native_divide(vdir.x, RAY_MAG); + vdir.y = native_divide(vdir.y, RAY_MAG); + vdir.z = native_divide(vdir.z, RAY_MAG); float4 vref = reflect(vdir, norm); float diff = fmax(dot(ldir, norm), 0.0f); - dcol += sp->mat.kd * scn->lights[i].color * diff; + dcol += sp->mat.kd /* scn->lights[i].color*/ * diff; - float spec = powr(fmax(dot(ldir, vref), 0.0f), sp->mat.spow); - scol += sp->mat.ks * scn->lights[i].color * spec; + float spec = native_powr(fmax(dot(ldir, vref), 0.0f), sp->mat.spow); + scol += sp->mat.ks /* scn->lights[i].color*/ * spec; } } @@ -171,7 +177,7 @@ } #define STACK_SIZE 64 -bool find_intersection(struct Ray ray, const struct Scene *scn, struct SurfPoint *spres) +bool find_intersection(struct Ray ray, const struct Scene *scn, struct SurfPoint *spres, read_only image2d_t kdimg) { struct SurfPoint sp0; sp0.t = 1.0; @@ -184,14 +190,15 @@ while(top > 0) { int idx = idxstack[--top]; // remove this index from the stack and process it - global const struct KDNode *node = scn->kdtree + idx; + struct KDNode node; + read_kdnode(idx, &node, kdimg); - if(intersect_aabb(ray, node->aabb)) { - if(node->left == -1) { + if(intersect_aabb(ray, node.aabb)) { + if(node.left == -1) { // leaf node... check each face in turn and update the nearest intersection as needed - for(int i=0; inum_faces; i++) { + for(int i=0; iface_idx[i]; + int fidx = node.face_idx[i]; if(intersect(ray, scn->faces + fidx, &spt) && spt.t < sp0.t) { sp0 = spt; @@ -199,8 +206,8 @@ } } else { // internal node... recurse to the children - idxstack[top++] = node->left; - idxstack[top++] = node->right; + idxstack[top++] = node.left; + idxstack[top++] = node.right; } } } @@ -232,7 +239,7 @@ float4 vec = pt - origin; float ndotvec = dot(norm, vec); - float t = ndotvec / ndotdir; + float t = native_divide(ndotvec, ndotdir); if(t < EPSILON || t > 1.0) { return false; @@ -269,12 +276,12 @@ }; int xsign = (int)(ray.dir.x < 0.0); - float invdirx = 1.0 / ray.dir.x; + float invdirx = native_recip(ray.dir.x); float tmin = (bbox[xsign].x - ray.origin.x) * invdirx; float tmax = (bbox[1 - xsign].x - ray.origin.x) * invdirx; int ysign = (int)(ray.dir.y < 0.0); - float invdiry = 1.0 / ray.dir.y; + float invdiry = native_recip(ray.dir.y); float tymin = (bbox[ysign].y - ray.origin.y) * invdiry; float tymax = (bbox[1 - ysign].y - ray.origin.y) * invdiry; @@ -286,7 +293,7 @@ if(tymax < tmax) tmax = tymax; int zsign = (int)(ray.dir.z < 0.0); - float invdirz = 1.0 / ray.dir.z; + float invdirz = native_recip(ray.dir.z); float tzmin = (bbox[zsign].z - ray.origin.z) * invdirz; float tzmax = (bbox[1 - zsign].z - ray.origin.z) * invdirz; @@ -345,9 +352,9 @@ float a1 = fabs(dot(x20, norm)) * 0.5; float a2 = fabs(dot(x01, norm)) * 0.5; - bc.x = a0 / area; - bc.y = a1 / area; - bc.z = a2 / area; + bc.x = native_divide(a0, area); + bc.y = native_divide(a1, area); + bc.z = native_divide(a2, area); return bc; } @@ -355,3 +362,32 @@ { return native_divide(v.x + v.y + v.z, 3.0); } + + +const sampler_t kdsampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; + +// read a KD-tree node from a texture scanline +void read_kdnode(int idx, struct KDNode *node, read_only image2d_t kdimg) +{ + int2 tc; + tc.x = 0; + tc.y = idx; + + node->aabb.min = read_imagef(kdimg, kdsampler, tc); tc.x++; + node->aabb.max = read_imagef(kdimg, kdsampler, tc); + + tc.x = 2 + MAX_NODE_FACES / 4; + float4 pix = read_imagef(kdimg, kdsampler, tc); + node->num_faces = (int)pix.x; + node->left = (int)pix.y; + node->right = (int)pix.z; + + tc.x = 2; + for(int i=0; inum_faces; i+=4) { + float4 pix = read_imagef(kdimg, kdsampler, tc); tc.x++; + node->face_idx[i] = (int)pix.x; + node->face_idx[i + 1] = (int)pix.y; + node->face_idx[i + 2] = (int)pix.z; + node->face_idx[i + 3] = (int)pix.w; + } +} diff -r 1169f3d04135 -r f9eec11e5acc src/clray.cc --- a/src/clray.cc Sat Aug 28 02:01:16 2010 +0100 +++ b/src/clray.cc Sat Aug 28 09:38:49 2010 +0100 @@ -34,6 +34,7 @@ static Scene scn; static unsigned int tex; + int main(int argc, char **argv) { glutInitWindowSize(800, 600); diff -r 1169f3d04135 -r f9eec11e5acc src/ocl.cc --- a/src/ocl.cc Sat Aug 28 02:01:16 2010 +0100 +++ b/src/ocl.cc Sat Aug 28 09:38:49 2010 +0100 @@ -503,7 +503,9 @@ { int err; - if((err = clBuildProgram(prog, 0, 0, "-cl-mad-enable", 0, 0)) != 0) { + const char *opt = "-cl-mad-enable -cl-single-precision-constant -cl-fast-relaxed-math"; + + if((err = clBuildProgram(prog, 0, 0, opt, 0, 0)) != 0) { size_t sz; clGetProgramBuildInfo(prog, devinf.id, CL_PROGRAM_BUILD_LOG, 0, 0, &sz); diff -r 1169f3d04135 -r f9eec11e5acc src/rt.cc --- a/src/rt.cc Sat Aug 28 02:01:16 2010 +0100 +++ b/src/rt.cc Sat Aug 28 09:38:49 2010 +0100 @@ -39,6 +39,7 @@ }; static Ray get_primary_ray(int x, int y, int w, int h, float vfov_deg); +static float *create_kdimage(const KDNodeGPU *kdtree, int num_nodes, int *xsz_ret, int *ysz_ret); static Face *faces; static Ray *prim_rays; @@ -52,6 +53,9 @@ static RendInfo rinf; +static long timing_sample_sum; +static long num_timing_samples; + bool init_renderer(int xsz, int ysz, Scene *scn, unsigned int tex) { @@ -91,7 +95,9 @@ fprintf(stderr, "failed to create kdtree buffer\n"); return false; } - // XXX now we can actually destroy the original kdtree and keep only the GPU version + + int kdimg_xsz, kdimg_ysz; + float *kdimg_pixels = create_kdimage(kdbuf, scn->get_num_kdnodes(), &kdimg_xsz, &kdimg_ysz); /* setup argument buffers */ #ifdef CLGL_INTEROP @@ -106,7 +112,11 @@ prog->set_arg_buffer(KARG_PRIM_RAYS, ARG_RD, xsz * ysz * sizeof *prim_rays, prim_rays); prog->set_arg_buffer(KARG_XFORM, ARG_RD, 16 * sizeof(float)); prog->set_arg_buffer(KARG_INVTRANS_XFORM, ARG_RD, 16 * sizeof(float)); - prog->set_arg_buffer(KARG_KDTREE, ARG_RD, scn->get_num_kdnodes() * sizeof *kdbuf, kdbuf); + //prog->set_arg_buffer(KARG_KDTREE, ARG_RD, scn->get_num_kdnodes() * sizeof *kdbuf, kdbuf); + prog->set_arg_image(KARG_KDTREE, ARG_RD, kdimg_xsz, kdimg_ysz, kdimg_pixels); + + delete [] kdimg_pixels; + if(prog->get_num_args() < NUM_KERNEL_ARGS) { return false; @@ -125,6 +135,8 @@ void destroy_renderer() { delete prog; + + printf("rendertime mean: %ld msec\n", timing_sample_sum / num_timing_samples); } bool render() @@ -172,7 +184,11 @@ unmap_mem_buffer(mbuf); #endif - printf("rendered in %ld msec\n", get_msec() - tm0); + long msec = get_msec() - tm0; + timing_sample_sum += msec; + num_timing_samples++; + + printf("rendered in %ld msec\n", msec); return true; } @@ -273,10 +289,61 @@ float py = 1.0 - ((float)y / (float)h) * ysz; float pz = 1.0 / tan(0.5 * vfov); - px *= 100.0; - py *= 100.0; - pz *= 100.0; + float mag = sqrt(px * px + py * py + pz * pz); + + px = px * 500.0 / mag; + py = py * 500.0 / mag; + pz = pz * 500.0 / mag; Ray ray = {{0, 0, 0, 1}, {px, py, -pz, 1}}; return ray; } + +static int next_pow2(int x) +{ + x--; + x = (x >> 1) | x; + x = (x >> 2) | x; + x = (x >> 4) | x; + x = (x >> 8) | x; + x = (x >> 16) | x; + return x + 1; +} + +static float *create_kdimage(const KDNodeGPU *kdtree, int num_nodes, int *xsz_ret, int *ysz_ret) +{ + int xsz = 16; + int ysz = next_pow2(num_nodes); + + printf("creating kdtree image %dx%d (%d nodes)\n", xsz, ysz, num_nodes); + + float *img = new float[4 * xsz * ysz]; + memset(img, 0, 4 * xsz * ysz * sizeof *img); + + for(int i=0; i