diff --git a/tiny_bvh_gpu.cpp b/tiny_bvh_gpu.cpp index ff90343..e0e2ad3 100644 --- a/tiny_bvh_gpu.cpp +++ b/tiny_bvh_gpu.cpp @@ -30,9 +30,7 @@ static Buffer* pixels, * accumulator, * raysIn, * raysOut, * connections, * triD // View pyramid for a pinhole camera struct RenderData { - bvhvec4 eye = bvhvec4( 0, 30, 0, 0 ); - bvhvec4 view = bvhvec4( -1, 0, 0, 0 ); - bvhvec4 C, p0, p1, p2; + bvhvec4 eye = bvhvec4( 0, 30, 0, 0 ), view = bvhvec4( -1, 0, 0, 0 ), C, p0, p1, p2; uint32_t frameIdx, dummy1, dummy2, dummy3; } rd; @@ -46,6 +44,15 @@ void AddMesh( const char* file, float scale = 1, bvhvec3 pos = {}, int c = 0, in for (int* b = (int*)tris + (triCount - N) * 12, i = 0; i < N * 3; i++) *(bvhvec3*)b = *(bvhvec3*)b * scale + pos, b[3] = c ? c : b[3], b += 4; } +void AddQuad( const bvhvec3 pos, const float w, const float d, int c ) +{ + bvhvec4* data = (bvhvec4*)tinybvh::malloc64( (triCount + 2) * 48 ); + if (tris) memcpy( data + 6, tris, triCount * 48 ), tinybvh::free64( tris ); + data[0] = bvhvec3( -w, 0, -d ), data[1] = bvhvec3( w, 0, -d ); + data[2] = bvhvec3( w, 0, d ), data[3] = bvhvec3( -w, 0, -d ), tris = data; + data[4] = bvhvec3( w, 0, d ), data[5] = bvhvec3( -w, 0, d ), triCount += 2; + for( int i = 0; i < 6; i++ ) data[i] = 0.5f * data[i] + pos, data[i].w = *(float*)&c; +} Buffer* cwbvhNodes = 0; Buffer* cwbvhTris = 0; @@ -73,6 +80,7 @@ void Init() // load raw vertex data AddMesh( "./testdata/cryteksponza.bin", 1, bvhvec3( 0 ), 0xffffff ); AddMesh( "./testdata/lucy.bin", 1.1f, bvhvec3( -2, 4.1f, -3 ), 0xaaaaff ); + AddQuad( bvhvec3( 0, 30, -1 ), 9, 5, 0x1ffffff ); // build bvh (here: 'compressed wide bvh', for efficient GPU rendering) bvh.Build( tris, triCount ); // create gpu buffers @@ -99,12 +107,9 @@ bool UpdateCamera( float delta_time_s, fenster& f ) bvhvec3 right = normalize( cross( bvhvec3( 0, 1, 0 ), rd.view ) ), up = 0.8f * cross( rd.view, right ); // get camera controls. bool moved = false; - if (f.keys['A']) rd.eye += right * -1.0f * delta_time_s * 10, moved = true; - if (f.keys['D']) rd.eye += right * delta_time_s * 10, moved = true; - if (f.keys['W']) rd.eye += rd.view * delta_time_s * 10, moved = true; - if (f.keys['S']) rd.eye += rd.view * -1.0f * delta_time_s * 10, moved = true; - if (f.keys['R']) rd.eye += up * delta_time_s * 20, moved = true; - if (f.keys['F']) rd.eye += up * -1.0f * delta_time_s * 20, moved = true; + if (f.keys['A'] || f.keys['D']) rd.eye += right * delta_time_s * (f.keys['D'] ? 10 : -10), moved = true; + if (f.keys['W'] || f.keys['S']) rd.eye += rd.view * delta_time_s * (f.keys['W'] ? 10 : -10), moved = true; + if (f.keys['R'] || f.keys['F']) rd.eye += up * delta_time_s * (f.keys['R'] ? 20 : -20), moved = true; if (f.keys[20]) rd.view = normalize( rd.view + right * -1.0f * delta_time_s ), moved = true; if (f.keys[19]) rd.view = normalize( rd.view + right * delta_time_s ), moved = true; if (f.keys[17]) rd.view = normalize( rd.view + up * -1.0f * delta_time_s ), moved = true; @@ -132,7 +137,7 @@ void Tick( float delta_time_s, fenster& f, uint32_t* buf ) init->Run( 1 ); // init atomic counters, set buffer ptrs etc. generate->SetArguments( raysOut ); generate->Run2D( oclint2( SCRWIDTH, SCRHEIGHT ) ); - for (int i = 0; i < 4; i++) + for (int i = 0; i < 2; i++) { swap( raysOut, raysIn ); extend->SetArguments( raysIn ); diff --git a/traverse.cl b/traverse.cl index 0011022..35906d1 100644 --- a/traverse.cl +++ b/traverse.cl @@ -832,6 +832,254 @@ float4 traverse_cwbvh( global const float4* cwbvhNodes, global const float4* cwb return hit; } +bool isoccluded_cwbvh( global const float4* cwbvhNodes, global const float4* cwbvhTris, const float3 O, const float3 D, const float3 rD, const float t ) +{ + // initialize ray + const unsigned threadId = get_global_id( 0 ); +#ifdef SIMD_AABBTEST + const float4 O4 = (float4)( O, 1 ); // rayData[threadId].O; + const float4 D4 = (float4)( D, 0 ); // rayData[threadId].D; + const float4 rD4 = (float4)( rD, 1 ); // rayData[threadId].rD; +#endif // otherwise, we'll use the float3 input directly. + // prepare traversal + uint2 stack[STACK_SIZE]; + uint stackPtr = 0; + float tmax = t; +#ifdef SIMD_AABBTEST + const uint octinv4 = (7 - ((D4.x < 0 ? 4 : 0) | (D4.y < 0 ? 2 : 0) | (D4.z < 0 ? 1 : 0))) * 0x1010101; +#else + const uint octinv4 = (7 - ((D.x < 0 ? 4 : 0) | (D.y < 0 ? 2 : 0) | (D.z < 0 ? 1 : 0))) * 0x1010101; +#endif + uint2 ngroup = (uint2)(0, 0b10000000000000000000000000000000), tgroup = (uint2)(0); + do + { + if (ngroup.y > 0x00FFFFFF) + { + const unsigned hits = ngroup.y, imask = ngroup.y; + const unsigned child_bit_index = __bfind( hits ); + const unsigned child_node_base_index = ngroup.x; + ngroup.y &= ~(1 << child_bit_index); + if (ngroup.y > 0x00FFFFFF) { STACK_PUSH( ngroup ); } + { + const unsigned slot_index = (child_bit_index - 24) ^ (octinv4 & 255); + const unsigned relative_index = __popc( imask & ~(0xFFFFFFFF << slot_index) ); + const unsigned child_node_index = child_node_base_index + relative_index; + #ifdef USE_VLOAD_VSTORE + const float* p = (float*)&cwbvhNodes[child_node_index * 5 + 0]; + float4 n0 = vload4( 0, p ), n1 = vload4( 1, p ), n2 = vload4( 2, p ); + float4 n3 = vload4( 3, p ), n4 = vload4( 4, p ); + #else + float4 n0 = cwbvhNodes[child_node_index * 5 + 0], n1 = cwbvhNodes[child_node_index * 5 + 1]; + float4 n2 = cwbvhNodes[child_node_index * 5 + 2], n3 = cwbvhNodes[child_node_index * 5 + 3]; + float4 n4 = cwbvhNodes[child_node_index * 5 + 4]; + #endif + const char4 e = as_char4( n0.w ); + ngroup.x = as_uint( n1.x ), tgroup = (uint2)(as_uint( n1.y ), 0); + unsigned hitmask = 0; + #ifdef SIMD_AABBTEST + const float4 idir4 = (float4)( + as_float( (e.x + 127) << 23 ) * rD4.x, as_float( (e.y + 127) << 23 ) * rD4.y, + as_float( (e.z + 127) << 23 ) * rD4.z, 1 + ); + const float4 orig4 = (n0 - O4) * rD4; + #else + const float idirx = as_float( (e.x + 127) << 23 ) * rD.x; + const float idiry = as_float( (e.y + 127) << 23 ) * rD.y; + const float idirz = as_float( (e.z + 127) << 23 ) * rD.z; + const float origx = (n0.x - O.x) * rD.x; + const float origy = (n0.y - O.y) * rD.y; + const float origz = (n0.z - O.z) * rD.z; + #endif + { // first 4 + const unsigned meta4 = as_uint( n1.z ), is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010; + const unsigned inner_mask4 = sign_extend_s8x4( is_inner4 << 3 ); + const unsigned bit_index4 = (meta4 ^ (octinv4 & inner_mask4)) & 0x1F1F1F1F; + const unsigned child_bits4 = (meta4 >> 5) & 0x07070707; + const float4 lox4 = convert_float4( as_uchar4( rD.x < 0 ? n3.z : n2.x ) ), hix4 = convert_float4( as_uchar4( rD.x < 0 ? n2.x : n3.z ) ); + const float4 loy4 = convert_float4( as_uchar4( rD.y < 0 ? n4.x : n2.z ) ), hiy4 = convert_float4( as_uchar4( rD.y < 0 ? n2.z : n4.x ) ); + const float4 loz4 = convert_float4( as_uchar4( rD.z < 0 ? n4.z : n3.x ) ), hiz4 = convert_float4( as_uchar4( rD.z < 0 ? n3.x : n4.z ) ); + { + #ifdef SIMD_AABBTEST + const float4 tminx4 = lox4 * idir4.xxxx + orig4.xxxx, tmaxx4 = hix4 * idir4.xxxx + orig4.xxxx; + const float4 tminy4 = loy4 * idir4.yyyy + orig4.yyyy, tmaxy4 = hiy4 * idir4.yyyy + orig4.yyyy; + const float4 tminz4 = loz4 * idir4.zzzz + orig4.zzzz, tmaxz4 = hiz4 * idir4.zzzz + orig4.zzzz; + const float cmina = fmax( fmax( fmax( tminx4.x, tminy4.x ), tminz4.x ), 0 ); + const float cmaxa = fmin( fmin( fmin( tmaxx4.x, tmaxy4.x ), tmaxz4.x ), tmax ); + const float cminb = fmax( fmax( fmax( tminx4.y, tminy4.y ), tminz4.y ), 0 ); + const float cmaxb = fmin( fmin( fmin( tmaxx4.y, tmaxy4.y ), tmaxz4.y ), tmax ); + if (cmina <= cmaxa) UPDATE_HITMASK; + if (cminb <= cmaxb) UPDATE_HITMASK1; + #else + float tminx0 = _native_fma( lox4.x, idirx, origx ), tminx1 = _native_fma( lox4.y, idirx, origx ); + float tminy0 = _native_fma( loy4.x, idiry, origy ), tminy1 = _native_fma( loy4.y, idiry, origy ); + float tminz0 = _native_fma( loz4.x, idirz, origz ), tminz1 = _native_fma( loz4.y, idirz, origz ); + float tmaxx0 = _native_fma( hix4.x, idirx, origx ), tmaxx1 = _native_fma( hix4.y, idirx, origx ); + float tmaxy0 = _native_fma( hiy4.x, idiry, origy ), tmaxy1 = _native_fma( hiy4.y, idiry, origy ); + float tmaxz0 = _native_fma( hiz4.x, idirz, origz ), tmaxz1 = _native_fma( hiz4.y, idirz, origz ); + n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 ); + n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax ); + n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 ); + n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax ); + if (n0.x <= n0.y) UPDATE_HITMASK; + if (n1.x <= n1.y) UPDATE_HITMASK1; + #endif + #ifdef SIMD_AABBTEST + const float cminc = fmax( fmax( fmax( tminx4.z, tminy4.z ), tminz4.z ), 0 ); + const float cmaxc = fmin( fmin( fmin( tmaxx4.z, tmaxy4.z ), tmaxz4.z ), tmax ); + const float cmind = fmax( fmax( fmax( tminx4.w, tminy4.w ), tminz4.w ), 0 ); + const float cmaxd = fmin( fmin( fmin( tmaxx4.w, tmaxy4.w ), tmaxz4.w ), tmax ); + if (cminc <= cmaxc) UPDATE_HITMASK2; + if (cmind <= cmaxd) UPDATE_HITMASK3; + #else + tminx0 = _native_fma( lox4.z, idirx, origx ), tminx1 = _native_fma( lox4.w, idirx, origx ); + tminy0 = _native_fma( loy4.z, idiry, origy ), tminy1 = _native_fma( loy4.w, idiry, origy ); + tminz0 = _native_fma( loz4.z, idirz, origz ), tminz1 = _native_fma( loz4.w, idirz, origz ); + tmaxx0 = _native_fma( hix4.z, idirx, origx ), tmaxx1 = _native_fma( hix4.w, idirx, origx ); + tmaxy0 = _native_fma( hiy4.z, idiry, origy ), tmaxy1 = _native_fma( hiy4.w, idiry, origy ); + tmaxz0 = _native_fma( hiz4.z, idirz, origz ), tmaxz1 = _native_fma( hiz4.w, idirz, origz ); + n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 ); + n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax ); + n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 ); + n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax ); + if (n0.x <= n0.y) UPDATE_HITMASK2; + if (n1.x <= n1.y) UPDATE_HITMASK3; + #endif + } + } + { // second 4 + const unsigned meta4 = as_uint( n1.w ), is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010; + const unsigned inner_mask4 = sign_extend_s8x4( is_inner4 << 3 ); + const unsigned bit_index4 = (meta4 ^ (octinv4 & inner_mask4)) & 0x1F1F1F1F; + const unsigned child_bits4 = (meta4 >> 5) & 0x07070707; + const float4 lox4 = convert_float4( as_uchar4( rD.x < 0 ? n3.w : n2.y ) ), hix4 = convert_float4( as_uchar4( rD.x < 0 ? n2.y : n3.w ) ); + const float4 loy4 = convert_float4( as_uchar4( rD.y < 0 ? n4.y : n2.w ) ), hiy4 = convert_float4( as_uchar4( rD.y < 0 ? n2.w : n4.y ) ); + const float4 loz4 = convert_float4( as_uchar4( rD.z < 0 ? n4.w : n3.y ) ), hiz4 = convert_float4( as_uchar4( rD.z < 0 ? n3.y : n4.w ) ); + { + #ifdef SIMD_AABBTEST + const float4 tminx4 = lox4 * idir4.xxxx + orig4.xxxx, tmaxx4 = hix4 * idir4.xxxx + orig4.xxxx; + const float4 tminy4 = loy4 * idir4.yyyy + orig4.yyyy, tmaxy4 = hiy4 * idir4.yyyy + orig4.yyyy; + const float4 tminz4 = loz4 * idir4.zzzz + orig4.zzzz, tmaxz4 = hiz4 * idir4.zzzz + orig4.zzzz; + const float cmina = fmax( fmax( fmax( tminx4.x, tminy4.x ), tminz4.x ), 0 ); + const float cmaxa = fmin( fmin( fmin( tmaxx4.x, tmaxy4.x ), tmaxz4.x ), tmax ); + const float cminb = fmax( fmax( fmax( tminx4.y, tminy4.y ), tminz4.y ), 0 ); + const float cmaxb = fmin( fmin( fmin( tmaxx4.y, tmaxy4.y ), tmaxz4.y ), tmax ); + if (cmina <= cmaxa) UPDATE_HITMASK0; + if (cminb <= cmaxb) UPDATE_HITMASK1; + #else + float tminx0 = _native_fma( lox4.x, idirx, origx ), tminx1 = _native_fma( lox4.y, idirx, origx ); + float tminy0 = _native_fma( loy4.x, idiry, origy ), tminy1 = _native_fma( loy4.y, idiry, origy ); + float tminz0 = _native_fma( loz4.x, idirz, origz ), tminz1 = _native_fma( loz4.y, idirz, origz ); + float tmaxx0 = _native_fma( hix4.x, idirx, origx ), tmaxx1 = _native_fma( hix4.y, idirx, origx ); + float tmaxy0 = _native_fma( hiy4.x, idiry, origy ), tmaxy1 = _native_fma( hiy4.y, idiry, origy ); + float tmaxz0 = _native_fma( hiz4.x, idirz, origz ), tmaxz1 = _native_fma( hiz4.y, idirz, origz ); + n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 ); + n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax ); + n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 ); + n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax ); + if (n0.x <= n0.y) UPDATE_HITMASK0; + if (n1.x <= n1.y) UPDATE_HITMASK1; + #endif + #ifdef SIMD_AABBTEST + const float cminc = fmax( fmax( fmax( tminx4.z, tminy4.z ), tminz4.z ), 0 ); + const float cmaxc = fmin( fmin( fmin( tmaxx4.z, tmaxy4.z ), tmaxz4.z ), tmax ); + const float cmind = fmax( fmax( fmax( tminx4.w, tminy4.w ), tminz4.w ), 0 ); + const float cmaxd = fmin( fmin( fmin( tmaxx4.w, tmaxy4.w ), tmaxz4.w ), tmax ); + if (cminc <= cmaxc) UPDATE_HITMASK2; + if (cmind <= cmaxd) UPDATE_HITMASK3; + #else + tminx0 = _native_fma( lox4.z, idirx, origx ), tminx1 = _native_fma( lox4.w, idirx, origx ); + tminy0 = _native_fma( loy4.z, idiry, origy ), tminy1 = _native_fma( loy4.w, idiry, origy ); + tminz0 = _native_fma( loz4.z, idirz, origz ), tminz1 = _native_fma( loz4.w, idirz, origz ); + tmaxx0 = _native_fma( hix4.z, idirx, origx ), tmaxx1 = _native_fma( hix4.w, idirx, origx ); + tmaxy0 = _native_fma( hiy4.z, idiry, origy ), tmaxy1 = _native_fma( hiy4.w, idiry, origy ); + tmaxz0 = _native_fma( hiz4.z, idirz, origz ), tmaxz1 = _native_fma( hiz4.w, idirz, origz ); + n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 ); + n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax ); + n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 ); + n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax ); + if (n0.x <= n0.y) UPDATE_HITMASK2; + if (n1.x <= n1.y) UPDATE_HITMASK3; + #endif + } + } + ngroup.y = (hitmask & 0xFF000000) | (as_uint( n0.w ) >> 24), tgroup.y = hitmask & 0x00FFFFFF; + } + } + else tgroup = ngroup, ngroup = (uint2)(0); + while (tgroup.y != 0) + { + #ifdef CWBVH_COMPRESSED_TRIS + // Fast intersection of triangle data for the algorithm in: + // "Fast Ray-Triangle Intersections by Coordinate Transformation" + // Baldwin & Weber, 2016. + const unsigned triangleIndex = __bfind( tgroup.y ), triAddr = tgroup.x + triangleIndex * 4; + const float4 T2 = cwbvhTris[triAddr + 2]; + const float transS = T2.x * O.x + T2.y * O.y + T2.z * O.z + T2.w; + const float transD = T2.x * D.x + T2.y * D.y + T2.z * D.z; + const float d = -transS / transD; + if (d > 0 && d < tmax) + { + const float4 T0 = cwbvhTris[triAddr + 0]; + const float4 T1 = cwbvhTris[triAddr + 1]; + #ifdef SIMD_AABBTEST + const float4 I = O4 + d * D4; + #else + const float3 I = O + d * D; + #endif + const float u = T0.x * I.x + T0.y * I.y + T0.z * I.z + T0.w; + const float v = T1.x * I.x + T1.y * I.y + T1.z * I.z + T1.w; + const bool hit = u >= 0 && v >= 0 && u + v < 1; + if (hit) return true; + } + #else + // Möller-Trumbore intersection; triangles are stored as 3x16 bytes, + // with the original primitive index in the (otherwise unused) w + // component of vertex 0. + const int triangleIndex = __bfind( tgroup.y ), triAddr = tgroup.x + triangleIndex * 3; + const float3 v0 = cwbvhTris[triAddr].xyz; + const float3 e1 = cwbvhTris[triAddr + 1].xyz - v0; + const float3 e2 = cwbvhTris[triAddr + 2].xyz - v0; + #ifdef SIMD_AABBTEST + const float3 r = cross( D4.xyz, e2 ); + #else + const float3 r = cross( D, e2 ); + #endif + const float a = dot( e1, r ); + if (fabs( a ) > 0.0000001f) + { + const float f = 1 / a; + #ifdef SIMD_AABBTEST + const float3 s = O4.xyz - v0; + #else + const float3 s = O - v0; + #endif + const float u = f * dot( s, r ); + if (u >= 0 && u <= 1) + { + const float3 q = cross( s, e1 ); + #ifdef SIMD_AABBTEST + const float v = f * dot( D4.xyz, q ); + #else + const float v = f * dot( D, q ); + #endif + if (v >= 0 && u + v <= 1) + { + const float d = f * dot( e2, q ); + if (d > 0.0f && d < tmax) return true; + } + } + } + #endif + tgroup.y -= 1 << triangleIndex; + } + if (ngroup.y <= 0x00FFFFFF) + { + if (stackPtr > 0) { STACK_POP( ngroup ); } else break; + } + } while (true); + return false; // no occlusion found. +} + void kernel batch_cwbvh( global const float4* cwbvhNodes, global const float4* cwbvhTris, global struct Ray* rayData ) { // initialize ray diff --git a/wavefront.cl b/wavefront.cl index 885944a..7b8eb63 100644 --- a/wavefront.cl +++ b/wavefront.cl @@ -1,4 +1,4 @@ -// gpu-side path tracing (wavefront) +// basic gpu-side path tracing (wavefront) #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable @@ -69,12 +69,9 @@ struct Potential }; // atomic counter management - prepare for primary ray wavefront -void kernel SetRenderData( - int primaryRayCount, - float4 eye, float4 p0, float4 p1, float4 p2, - uint frameIdx, - global float4* cwbvhNodes, - global float4* cwbvhTris +void kernel SetRenderData( int primaryRayCount, + float4 eye, float4 p0, float4 p1, float4 p2, uint frameIdx, + global float4* cwbvhNodes, global float4* cwbvhTris ) { if (get_global_id( 0 ) != 0) return; @@ -128,19 +125,18 @@ void kernel Extend( global struct PathState* raysIn ) } } -// syncing counters: at this point, we need to reset the extendTasks counter. +// syncing counters: at this point, we need to reset the extendTasks and connectTasks counters. void kernel UpdateCounters1() { if (get_global_id( 0 ) != 0) return; extendTasks = 0; + connectTasks = 0; } // shade: process intersection results; this evaluates the BRDF and creates // extension rays and shadow rays. -void kernel Shade( - global float4* accumulator, - global struct PathState* raysIn, - global struct PathState* raysOut, +void kernel Shade( global float4* accumulator, + global struct PathState* raysIn, global struct PathState* raysOut, global struct Potential* shadowOut, global float4* verts ) @@ -152,44 +148,60 @@ void kernel Shade( const int pathId = atomic_dec( &shadeTasks ) - 1; if (pathId < 0) break; // fetch path data - float4 data0 = raysIn[pathId].T; // xyz = rgb, postponed pdf in w - float4 data1 = raysIn[pathId].O; // pixel index in O.w - float4 data2 = raysIn[pathId].D; // t in D.w - float4 data3 = raysIn[pathId].hit; // dist, u, v, prim + float4 T4 = raysIn[pathId].T; // xyz = rgb, postponed pdf in w + float4 O4 = raysIn[pathId].O; // pixel index in O.w + float4 D4 = raysIn[pathId].D; // t in D.w + float4 hit = raysIn[pathId].hit; // dist, u, v, prim // prepare for shading - uint depth = as_uint( data1.w ) & 15; - uint pixelIdx = as_uint( data1.w ) >> 4; - uint seed = WangHash( as_uint( data1.w ) + rd.frameIdx * 17117); + uint depth = as_uint( O4.w ) & 15; + uint pixelIdx = as_uint( O4.w ) >> 4; + uint seed = WangHash( as_uint( O4.w ) + rd.frameIdx * 17117); + float3 T = T4.xyz; + float t = hit.x; // end path on sky - if (data3.x == 1e30f) + if (t == 1e30f) { float3 skyColor = (float3)( 0.7f, 0.7f, 1.2f ); - accumulator[pixelIdx] += (float4)( data0.xyz * skyColor, 1 ); + accumulator[pixelIdx] += (float4)( T * skyColor, 1 ); continue; } // fetch geometry at intersection point - uint vertIdx = as_uint( data3.w ) * 3; + uint vertIdx = as_uint( hit.w ) * 3; float4 v0 = verts[vertIdx]; + uint mat = as_uint( v0.w ) >> 24; + // end path on light + float3 lightColor = (float3)( 10 ); + if (mat == 1) + { + if (depth == 0) accumulator[pixelIdx] += (float4)( T * lightColor, 1 ); + continue; + } float3 vert0 = v0.xyz, vert1 = verts[vertIdx + 1].xyz, vert2 = verts[vertIdx + 2].xyz; float3 N = normalize( cross( vert1 - vert0, vert2 - vert0 ) ); - float3 D = data2.xyz; + float3 D = D4.xyz; if (dot( N, D ) > 0) N *= -1; - float3 T = data0.xyz; - float3 O = data1.xyz; - float t = data3.x; - float3 I = O + t * D; - // bounce - if (depth < 4) + float3 I = O4.xyz + t * D; + // direct illumination: next event estimation + float3 P = (float3)( RandomFloat( &seed ) * 9 - 4.5f, 30, RandomFloat( &seed ) * 5 - 3.5f ); + float3 L = P - I; + float NdotL = dot( N, L ); + if (NdotL > 0) + { + uint newShadowIdx = atomic_inc( &connectTasks ); + float dist2 = dot( L, L ), dist = sqrt( dist2 ); + L *= 1.0f / dist; + float NLdotL = fabs( L.y ); // actually, fabs( dot( L, LN ) ) + shadowOut[newShadowIdx].T = (float4)( lightColor * T * NdotL * NLdotL * (1.0f / dist2), 0 ); + shadowOut[newShadowIdx].O = (float4)( I + L * 0.001f, as_float( pixelIdx ) ); + shadowOut[newShadowIdx].D = (float4)( L, dist - 0.002f ); + } + // indirect illumination: diffuse bounce + if (depth < 2) { uint newRayIdx = atomic_inc( &extendTasks ); float3 BRDF = (float3)(1) /* just white for now */ * INVPI; - #if 0 - float3 R = DiffuseReflection( N, &seed ); - float PDF = INV2PI; - #else float3 R = CosWeightedDiffReflection( N, &seed ); float PDF = dot( N, R ) * INVPI; - #endif T *= dot( N, R ) * BRDF * (1.0f / PDF); raysOut[newRayIdx].T = (float4)( T, 1 ); raysOut[newRayIdx].O = (float4)( I + R * 0.001f, as_float( (pixelIdx << 4) + depth + 1 ) ); @@ -209,17 +221,22 @@ void kernel UpdateCounters2() // if not occluded. void kernel Connect( global float4* accumulator, global struct Potential* shadowIn ) { - // obtain task - const int rayId = atomic_dec( &connectTasks ) - 1; - if (rayId < 0) return; - const float4 T4 = shadowIn[rayId].T; - const float4 O4 = shadowIn[rayId].O; - const float4 D4 = shadowIn[rayId].D; - const float3 rD = (float3)( 1.0f / D4.x, 1.0f / D4.y, 1.0f / D4.z ); - bool occluded = false; // isoccluded_cwbvh( rd.cwbvhNodes, rd.cwbvhTris, O4.xyz, D4.xyz, rD, D4.w ); - if (occluded) return; - uint pixelIdx = as_uint( O4.w ); - accumulator[pixelIdx] += T4; + while (1) + { + // obtain task + if (connectTasks < 1) break; + const int rayId = atomic_dec( &connectTasks ) - 1; + if (rayId < 0) break; + const float4 T4 = shadowIn[rayId].T; + const float4 O4 = shadowIn[rayId].O; + const float4 D4 = shadowIn[rayId].D; + const float3 rD = (float3)( 1.0f / D4.x, 1.0f / D4.y, 1.0f / D4.z ); + if (!isoccluded_cwbvh( rd.cwbvhNodes, rd.cwbvhTris, O4.xyz, D4.xyz, rD, D4.w )) + { + uint pixelIdx = as_uint( O4.w ); + accumulator[pixelIdx] += T4; + } + } } // finalize: convert the accumulated values into final pixel values.