diff --git a/tiny_bvh_gpu.cpp b/tiny_bvh_gpu.cpp
index ff90343..e0e2ad3 100644
--- a/tiny_bvh_gpu.cpp
+++ b/tiny_bvh_gpu.cpp
@@ -30,9 +30,7 @@ static Buffer* pixels, * accumulator, * raysIn, * raysOut, * connections, * triD
 // View pyramid for a pinhole camera
 struct RenderData
 {
-	bvhvec4 eye = bvhvec4( 0, 30, 0, 0 );
-	bvhvec4 view = bvhvec4( -1, 0, 0, 0 );
-	bvhvec4 C, p0, p1, p2;
+	bvhvec4 eye = bvhvec4( 0, 30, 0, 0 ), view = bvhvec4( -1, 0, 0, 0 ), C, p0, p1, p2;
 	uint32_t frameIdx, dummy1, dummy2, dummy3;
 } rd;
 
@@ -46,6 +44,15 @@ void AddMesh( const char* file, float scale = 1, bvhvec3 pos = {}, int c = 0, in
 	for (int* b = (int*)tris + (triCount - N) * 12, i = 0; i < N * 3; i++)
 		*(bvhvec3*)b = *(bvhvec3*)b * scale + pos, b[3] = c ? c : b[3], b += 4;
 }
+void AddQuad( const bvhvec3 pos, const float w, const float d, int c )
+{
+	bvhvec4* data = (bvhvec4*)tinybvh::malloc64( (triCount + 2) * 48 );
+	if (tris) memcpy( data + 6, tris, triCount * 48 ), tinybvh::free64( tris );
+	data[0] = bvhvec3( -w, 0, -d ), data[1] = bvhvec3( w, 0, -d );
+	data[2] = bvhvec3( w, 0, d ), data[3] = bvhvec3( -w, 0, -d ), tris = data;
+	data[4] = bvhvec3( w, 0, d ), data[5] = bvhvec3( -w, 0, d ), triCount += 2;
+	for( int i = 0; i < 6; i++ ) data[i] = 0.5f * data[i] + pos, data[i].w = *(float*)&c; 
+}
 
 Buffer* cwbvhNodes = 0;
 Buffer* cwbvhTris = 0;
@@ -73,6 +80,7 @@ void Init()
 	// load raw vertex data
 	AddMesh( "./testdata/cryteksponza.bin", 1, bvhvec3( 0 ), 0xffffff );
 	AddMesh( "./testdata/lucy.bin", 1.1f, bvhvec3( -2, 4.1f, -3 ), 0xaaaaff );
+	AddQuad( bvhvec3( 0, 30, -1 ), 9, 5, 0x1ffffff ); 
 	// build bvh (here: 'compressed wide bvh', for efficient GPU rendering)
 	bvh.Build( tris, triCount );
 	// create gpu buffers
@@ -99,12 +107,9 @@ bool UpdateCamera( float delta_time_s, fenster& f )
 	bvhvec3 right = normalize( cross( bvhvec3( 0, 1, 0 ), rd.view ) ), up = 0.8f * cross( rd.view, right );
 	// get camera controls.
 	bool moved = false;
-	if (f.keys['A']) rd.eye += right * -1.0f * delta_time_s * 10, moved = true;
-	if (f.keys['D']) rd.eye += right * delta_time_s * 10, moved = true;
-	if (f.keys['W']) rd.eye += rd.view * delta_time_s * 10, moved = true;
-	if (f.keys['S']) rd.eye += rd.view * -1.0f * delta_time_s * 10, moved = true;
-	if (f.keys['R']) rd.eye += up * delta_time_s * 20, moved = true;
-	if (f.keys['F']) rd.eye += up * -1.0f * delta_time_s * 20, moved = true;
+	if (f.keys['A'] || f.keys['D']) rd.eye += right * delta_time_s * (f.keys['D'] ? 10 : -10), moved = true;
+	if (f.keys['W'] || f.keys['S']) rd.eye += rd.view * delta_time_s * (f.keys['W'] ? 10 : -10), moved = true;
+	if (f.keys['R'] || f.keys['F']) rd.eye += up * delta_time_s * (f.keys['R'] ? 20 : -20), moved = true;
 	if (f.keys[20]) rd.view = normalize( rd.view + right * -1.0f * delta_time_s ), moved = true;
 	if (f.keys[19]) rd.view = normalize( rd.view + right * delta_time_s ), moved = true;
 	if (f.keys[17]) rd.view = normalize( rd.view + up * -1.0f * delta_time_s ), moved = true;
@@ -132,7 +137,7 @@ void Tick( float delta_time_s, fenster& f, uint32_t* buf )
 	init->Run( 1 ); // init atomic counters, set buffer ptrs etc.
 	generate->SetArguments( raysOut );
 	generate->Run2D( oclint2( SCRWIDTH, SCRHEIGHT ) );
-	for (int i = 0; i < 4; i++)
+	for (int i = 0; i < 2; i++)
 	{
 		swap( raysOut, raysIn );
 		extend->SetArguments( raysIn );
diff --git a/traverse.cl b/traverse.cl
index 0011022..35906d1 100644
--- a/traverse.cl
+++ b/traverse.cl
@@ -832,6 +832,254 @@ float4 traverse_cwbvh( global const float4* cwbvhNodes, global const float4* cwb
 	return hit;
 }
 
+bool isoccluded_cwbvh( global const float4* cwbvhNodes, global const float4* cwbvhTris, const float3 O, const float3 D, const float3 rD, const float t )
+{
+	// initialize ray
+	const unsigned threadId = get_global_id( 0 );
+#ifdef SIMD_AABBTEST
+	const float4 O4 = (float4)( O, 1 ); // rayData[threadId].O;
+	const float4 D4 = (float4)( D, 0 ); // rayData[threadId].D;
+	const float4 rD4 = (float4)( rD, 1 ); // rayData[threadId].rD;
+#endif // otherwise, we'll use the float3 input directly.
+	// prepare traversal
+	uint2 stack[STACK_SIZE];
+	uint stackPtr = 0;
+	float tmax = t;
+#ifdef SIMD_AABBTEST
+	const uint octinv4 = (7 - ((D4.x < 0 ? 4 : 0) | (D4.y < 0 ? 2 : 0) | (D4.z < 0 ? 1 : 0))) * 0x1010101;
+#else
+	const uint octinv4 = (7 - ((D.x < 0 ? 4 : 0) | (D.y < 0 ? 2 : 0) | (D.z < 0 ? 1 : 0))) * 0x1010101;
+#endif
+	uint2 ngroup = (uint2)(0, 0b10000000000000000000000000000000), tgroup = (uint2)(0);
+	do
+	{
+		if (ngroup.y > 0x00FFFFFF)
+		{
+			const unsigned hits = ngroup.y, imask = ngroup.y;
+			const unsigned child_bit_index = __bfind( hits );
+			const unsigned child_node_base_index = ngroup.x;
+			ngroup.y &= ~(1 << child_bit_index);
+			if (ngroup.y > 0x00FFFFFF) { STACK_PUSH( ngroup ); }
+			{
+				const unsigned slot_index = (child_bit_index - 24) ^ (octinv4 & 255);
+				const unsigned relative_index = __popc( imask & ~(0xFFFFFFFF << slot_index) );
+				const unsigned child_node_index = child_node_base_index + relative_index;
+			#ifdef USE_VLOAD_VSTORE
+				const float* p = (float*)&cwbvhNodes[child_node_index * 5 + 0];
+				float4 n0 = vload4( 0, p ), n1 = vload4( 1, p ), n2 = vload4( 2, p );
+				float4 n3 = vload4( 3, p ), n4 = vload4( 4, p );
+			#else
+				float4 n0 = cwbvhNodes[child_node_index * 5 + 0], n1 = cwbvhNodes[child_node_index * 5 + 1];
+				float4 n2 = cwbvhNodes[child_node_index * 5 + 2], n3 = cwbvhNodes[child_node_index * 5 + 3];
+				float4 n4 = cwbvhNodes[child_node_index * 5 + 4];
+			#endif
+				const char4 e = as_char4( n0.w );
+				ngroup.x = as_uint( n1.x ), tgroup = (uint2)(as_uint( n1.y ), 0);
+				unsigned hitmask = 0;
+			#ifdef SIMD_AABBTEST
+				const float4 idir4 = (float4)(
+					as_float( (e.x + 127) << 23 ) * rD4.x, as_float( (e.y + 127) << 23 ) * rD4.y,
+					as_float( (e.z + 127) << 23 ) * rD4.z, 1
+				);
+				const float4 orig4 = (n0 - O4) * rD4;
+			#else
+				const float idirx = as_float( (e.x + 127) << 23 ) * rD.x;
+				const float idiry = as_float( (e.y + 127) << 23 ) * rD.y;
+				const float idirz = as_float( (e.z + 127) << 23 ) * rD.z;
+				const float origx = (n0.x - O.x) * rD.x;
+				const float origy = (n0.y - O.y) * rD.y;
+				const float origz = (n0.z - O.z) * rD.z;
+			#endif
+				{	// first 4
+					const unsigned meta4 = as_uint( n1.z ), is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010;
+					const unsigned inner_mask4 = sign_extend_s8x4( is_inner4 << 3 );
+					const unsigned bit_index4 = (meta4 ^ (octinv4 & inner_mask4)) & 0x1F1F1F1F;
+					const unsigned child_bits4 = (meta4 >> 5) & 0x07070707;
+					const float4 lox4 = convert_float4( as_uchar4( rD.x < 0 ? n3.z : n2.x ) ), hix4 = convert_float4( as_uchar4( rD.x < 0 ? n2.x : n3.z ) );
+					const float4 loy4 = convert_float4( as_uchar4( rD.y < 0 ? n4.x : n2.z ) ), hiy4 = convert_float4( as_uchar4( rD.y < 0 ? n2.z : n4.x ) );
+					const float4 loz4 = convert_float4( as_uchar4( rD.z < 0 ? n4.z : n3.x ) ), hiz4 = convert_float4( as_uchar4( rD.z < 0 ? n3.x : n4.z ) );
+					{
+					#ifdef SIMD_AABBTEST
+						const float4 tminx4 = lox4 * idir4.xxxx + orig4.xxxx, tmaxx4 = hix4 * idir4.xxxx + orig4.xxxx;
+						const float4 tminy4 = loy4 * idir4.yyyy + orig4.yyyy, tmaxy4 = hiy4 * idir4.yyyy + orig4.yyyy;
+						const float4 tminz4 = loz4 * idir4.zzzz + orig4.zzzz, tmaxz4 = hiz4 * idir4.zzzz + orig4.zzzz;
+						const float cmina = fmax( fmax( fmax( tminx4.x, tminy4.x ), tminz4.x ), 0 );
+						const float cmaxa = fmin( fmin( fmin( tmaxx4.x, tmaxy4.x ), tmaxz4.x ), tmax );
+						const float cminb = fmax( fmax( fmax( tminx4.y, tminy4.y ), tminz4.y ), 0 );
+						const float cmaxb = fmin( fmin( fmin( tmaxx4.y, tmaxy4.y ), tmaxz4.y ), tmax );
+						if (cmina <= cmaxa) UPDATE_HITMASK;
+						if (cminb <= cmaxb) UPDATE_HITMASK1;
+					#else
+						float tminx0 = _native_fma( lox4.x, idirx, origx ), tminx1 = _native_fma( lox4.y, idirx, origx );
+						float tminy0 = _native_fma( loy4.x, idiry, origy ), tminy1 = _native_fma( loy4.y, idiry, origy );
+						float tminz0 = _native_fma( loz4.x, idirz, origz ), tminz1 = _native_fma( loz4.y, idirz, origz );
+						float tmaxx0 = _native_fma( hix4.x, idirx, origx ), tmaxx1 = _native_fma( hix4.y, idirx, origx );
+						float tmaxy0 = _native_fma( hiy4.x, idiry, origy ), tmaxy1 = _native_fma( hiy4.y, idiry, origy );
+						float tmaxz0 = _native_fma( hiz4.x, idirz, origz ), tmaxz1 = _native_fma( hiz4.y, idirz, origz );
+						n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
+						n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
+						n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
+						n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
+						if (n0.x <= n0.y) UPDATE_HITMASK;
+						if (n1.x <= n1.y) UPDATE_HITMASK1;
+					#endif
+					#ifdef SIMD_AABBTEST
+						const float cminc = fmax( fmax( fmax( tminx4.z, tminy4.z ), tminz4.z ), 0 );
+						const float cmaxc = fmin( fmin( fmin( tmaxx4.z, tmaxy4.z ), tmaxz4.z ), tmax );
+						const float cmind = fmax( fmax( fmax( tminx4.w, tminy4.w ), tminz4.w ), 0 );
+						const float cmaxd = fmin( fmin( fmin( tmaxx4.w, tmaxy4.w ), tmaxz4.w ), tmax );
+						if (cminc <= cmaxc) UPDATE_HITMASK2;
+						if (cmind <= cmaxd) UPDATE_HITMASK3;
+					#else
+						tminx0 = _native_fma( lox4.z, idirx, origx ), tminx1 = _native_fma( lox4.w, idirx, origx );
+						tminy0 = _native_fma( loy4.z, idiry, origy ), tminy1 = _native_fma( loy4.w, idiry, origy );
+						tminz0 = _native_fma( loz4.z, idirz, origz ), tminz1 = _native_fma( loz4.w, idirz, origz );
+						tmaxx0 = _native_fma( hix4.z, idirx, origx ), tmaxx1 = _native_fma( hix4.w, idirx, origx );
+						tmaxy0 = _native_fma( hiy4.z, idiry, origy ), tmaxy1 = _native_fma( hiy4.w, idiry, origy );
+						tmaxz0 = _native_fma( hiz4.z, idirz, origz ), tmaxz1 = _native_fma( hiz4.w, idirz, origz );
+						n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
+						n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
+						n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
+						n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
+						if (n0.x <= n0.y) UPDATE_HITMASK2;
+						if (n1.x <= n1.y) UPDATE_HITMASK3;
+					#endif
+					}
+				}
+				{	// second 4
+					const unsigned meta4 = as_uint( n1.w ), is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010;
+					const unsigned inner_mask4 = sign_extend_s8x4( is_inner4 << 3 );
+					const unsigned bit_index4 = (meta4 ^ (octinv4 & inner_mask4)) & 0x1F1F1F1F;
+					const unsigned child_bits4 = (meta4 >> 5) & 0x07070707;
+					const float4 lox4 = convert_float4( as_uchar4( rD.x < 0 ? n3.w : n2.y ) ), hix4 = convert_float4( as_uchar4( rD.x < 0 ? n2.y : n3.w ) );
+					const float4 loy4 = convert_float4( as_uchar4( rD.y < 0 ? n4.y : n2.w ) ), hiy4 = convert_float4( as_uchar4( rD.y < 0 ? n2.w : n4.y ) );
+					const float4 loz4 = convert_float4( as_uchar4( rD.z < 0 ? n4.w : n3.y ) ), hiz4 = convert_float4( as_uchar4( rD.z < 0 ? n3.y : n4.w ) );
+					{
+					#ifdef SIMD_AABBTEST
+						const float4 tminx4 = lox4 * idir4.xxxx + orig4.xxxx, tmaxx4 = hix4 * idir4.xxxx + orig4.xxxx;
+						const float4 tminy4 = loy4 * idir4.yyyy + orig4.yyyy, tmaxy4 = hiy4 * idir4.yyyy + orig4.yyyy;
+						const float4 tminz4 = loz4 * idir4.zzzz + orig4.zzzz, tmaxz4 = hiz4 * idir4.zzzz + orig4.zzzz;
+						const float cmina = fmax( fmax( fmax( tminx4.x, tminy4.x ), tminz4.x ), 0 );
+						const float cmaxa = fmin( fmin( fmin( tmaxx4.x, tmaxy4.x ), tmaxz4.x ), tmax );
+						const float cminb = fmax( fmax( fmax( tminx4.y, tminy4.y ), tminz4.y ), 0 );
+						const float cmaxb = fmin( fmin( fmin( tmaxx4.y, tmaxy4.y ), tmaxz4.y ), tmax );
+						if (cmina <= cmaxa) UPDATE_HITMASK0;
+						if (cminb <= cmaxb) UPDATE_HITMASK1;
+					#else
+						float tminx0 = _native_fma( lox4.x, idirx, origx ), tminx1 = _native_fma( lox4.y, idirx, origx );
+						float tminy0 = _native_fma( loy4.x, idiry, origy ), tminy1 = _native_fma( loy4.y, idiry, origy );
+						float tminz0 = _native_fma( loz4.x, idirz, origz ), tminz1 = _native_fma( loz4.y, idirz, origz );
+						float tmaxx0 = _native_fma( hix4.x, idirx, origx ), tmaxx1 = _native_fma( hix4.y, idirx, origx );
+						float tmaxy0 = _native_fma( hiy4.x, idiry, origy ), tmaxy1 = _native_fma( hiy4.y, idiry, origy );
+						float tmaxz0 = _native_fma( hiz4.x, idirz, origz ), tmaxz1 = _native_fma( hiz4.y, idirz, origz );
+						n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
+						n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
+						n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
+						n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
+						if (n0.x <= n0.y) UPDATE_HITMASK0;
+						if (n1.x <= n1.y) UPDATE_HITMASK1;
+					#endif
+					#ifdef SIMD_AABBTEST
+						const float cminc = fmax( fmax( fmax( tminx4.z, tminy4.z ), tminz4.z ), 0 );
+						const float cmaxc = fmin( fmin( fmin( tmaxx4.z, tmaxy4.z ), tmaxz4.z ), tmax );
+						const float cmind = fmax( fmax( fmax( tminx4.w, tminy4.w ), tminz4.w ), 0 );
+						const float cmaxd = fmin( fmin( fmin( tmaxx4.w, tmaxy4.w ), tmaxz4.w ), tmax );
+						if (cminc <= cmaxc) UPDATE_HITMASK2;
+						if (cmind <= cmaxd) UPDATE_HITMASK3;
+					#else
+						tminx0 = _native_fma( lox4.z, idirx, origx ), tminx1 = _native_fma( lox4.w, idirx, origx );
+						tminy0 = _native_fma( loy4.z, idiry, origy ), tminy1 = _native_fma( loy4.w, idiry, origy );
+						tminz0 = _native_fma( loz4.z, idirz, origz ), tminz1 = _native_fma( loz4.w, idirz, origz );
+						tmaxx0 = _native_fma( hix4.z, idirx, origx ), tmaxx1 = _native_fma( hix4.w, idirx, origx );
+						tmaxy0 = _native_fma( hiy4.z, idiry, origy ), tmaxy1 = _native_fma( hiy4.w, idiry, origy );
+						tmaxz0 = _native_fma( hiz4.z, idirz, origz ), tmaxz1 = _native_fma( hiz4.w, idirz, origz );
+						n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
+						n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
+						n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
+						n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
+						if (n0.x <= n0.y) UPDATE_HITMASK2;
+						if (n1.x <= n1.y) UPDATE_HITMASK3;
+					#endif
+					}
+				}
+				ngroup.y = (hitmask & 0xFF000000) | (as_uint( n0.w ) >> 24), tgroup.y = hitmask & 0x00FFFFFF;
+			}
+		}
+		else tgroup = ngroup, ngroup = (uint2)(0);
+		while (tgroup.y != 0)
+		{
+		#ifdef CWBVH_COMPRESSED_TRIS
+			// Fast intersection of triangle data for the algorithm in:
+			// "Fast Ray-Triangle Intersections by Coordinate Transformation"
+			// Baldwin & Weber, 2016.
+			const unsigned triangleIndex = __bfind( tgroup.y ), triAddr = tgroup.x + triangleIndex * 4;
+			const float4 T2 = cwbvhTris[triAddr + 2];
+			const float transS = T2.x * O.x + T2.y * O.y + T2.z * O.z + T2.w;
+			const float transD = T2.x * D.x + T2.y * D.y + T2.z * D.z;
+			const float d = -transS / transD;
+			if (d > 0 && d < tmax)
+			{
+				const float4 T0 = cwbvhTris[triAddr + 0];
+				const float4 T1 = cwbvhTris[triAddr + 1];
+			#ifdef SIMD_AABBTEST
+				const float4 I = O4 + d * D4;
+			#else
+				const float3 I = O + d * D;
+			#endif
+				const float u = T0.x * I.x + T0.y * I.y + T0.z * I.z + T0.w;
+				const float v = T1.x * I.x + T1.y * I.y + T1.z * I.z + T1.w;
+				const bool hit = u >= 0 && v >= 0 && u + v < 1;
+				if (hit) return true;
+			}
+		#else
+			// Möller-Trumbore intersection; triangles are stored as 3x16 bytes,
+			// with the original primitive index in the (otherwise unused) w 
+			// component of vertex 0.
+			const int triangleIndex = __bfind( tgroup.y ), triAddr = tgroup.x + triangleIndex * 3;
+			const float3 v0 = cwbvhTris[triAddr].xyz;
+			const float3 e1 = cwbvhTris[triAddr + 1].xyz - v0;
+			const float3 e2 = cwbvhTris[triAddr + 2].xyz - v0;
+		#ifdef SIMD_AABBTEST
+			const float3 r = cross( D4.xyz, e2 );
+		#else
+			const float3 r = cross( D, e2 );
+		#endif
+			const float a = dot( e1, r );
+			if (fabs( a ) > 0.0000001f)
+			{
+				const float f = 1 / a;
+			#ifdef SIMD_AABBTEST
+				const float3 s = O4.xyz - v0;
+			#else
+				const float3 s = O - v0;
+			#endif
+				const float u = f * dot( s, r );
+				if (u >= 0 && u <= 1)
+				{
+					const float3 q = cross( s, e1 );
+				#ifdef SIMD_AABBTEST
+					const float v = f * dot( D4.xyz, q );
+				#else
+					const float v = f * dot( D, q );
+				#endif
+					if (v >= 0 && u + v <= 1)
+					{
+						const float d = f * dot( e2, q );
+						if (d > 0.0f && d < tmax) return true;
+					}
+				}
+			}
+		#endif
+			tgroup.y -= 1 << triangleIndex;
+		}
+		if (ngroup.y <= 0x00FFFFFF)
+		{
+			if (stackPtr > 0) { STACK_POP( ngroup ); } else break;
+		}
+	} while (true);
+	return false; // no occlusion found.
+}
+
 void kernel batch_cwbvh( global const float4* cwbvhNodes, global const float4* cwbvhTris, global struct Ray* rayData )
 {
 	// initialize ray
diff --git a/wavefront.cl b/wavefront.cl
index 885944a..7b8eb63 100644
--- a/wavefront.cl
+++ b/wavefront.cl
@@ -1,4 +1,4 @@
-// gpu-side path tracing (wavefront)
+// basic gpu-side path tracing (wavefront)
 
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 
@@ -69,12 +69,9 @@ struct Potential
 };
 
 // atomic counter management - prepare for primary ray wavefront
-void kernel SetRenderData( 
-	int primaryRayCount,
-	float4 eye, float4 p0, float4 p1, float4 p2,
-	uint frameIdx,
-	global float4* cwbvhNodes,
-	global float4* cwbvhTris
+void kernel SetRenderData( int primaryRayCount,
+	float4 eye, float4 p0, float4 p1, float4 p2, uint frameIdx,
+	global float4* cwbvhNodes, global float4* cwbvhTris
 )
 {
 	if (get_global_id( 0 ) != 0) return;
@@ -128,19 +125,18 @@ void kernel Extend( global struct PathState* raysIn )
 	}
 }
 
-// syncing counters: at this point, we need to reset the extendTasks counter.
+// syncing counters: at this point, we need to reset the extendTasks and connectTasks counters.
 void kernel UpdateCounters1()
 {
 	if (get_global_id( 0 ) != 0) return;
 	extendTasks = 0;
+	connectTasks = 0;
 }
 
 // shade: process intersection results; this evaluates the BRDF and creates 
 // extension rays and shadow rays.
-void kernel Shade( 
-	global float4* accumulator, 
-	global struct PathState* raysIn, 
-	global struct PathState* raysOut, 
+void kernel Shade( global float4* accumulator, 
+	global struct PathState* raysIn, global struct PathState* raysOut, 
 	global struct Potential* shadowOut,
 	global float4* verts
 )
@@ -152,44 +148,60 @@ void kernel Shade(
 		const int pathId = atomic_dec( &shadeTasks ) - 1;
 		if (pathId < 0) break;
 		// fetch path data
-		float4 data0 = raysIn[pathId].T;	// xyz = rgb, postponed pdf in w
-		float4 data1 = raysIn[pathId].O;	// pixel index in O.w
-		float4 data2 = raysIn[pathId].D;	// t in D.w
-		float4 data3 = raysIn[pathId].hit;	// dist, u, v, prim
+		float4 T4 = raysIn[pathId].T;	// xyz = rgb, postponed pdf in w
+		float4 O4 = raysIn[pathId].O;	// pixel index in O.w
+		float4 D4 = raysIn[pathId].D;	// t in D.w
+		float4 hit = raysIn[pathId].hit;	// dist, u, v, prim
 		// prepare for shading
-		uint depth = as_uint( data1.w ) & 15;
-		uint pixelIdx = as_uint( data1.w ) >> 4;
-		uint seed = WangHash( as_uint( data1.w ) + rd.frameIdx * 17117);
+		uint depth = as_uint( O4.w ) & 15;
+		uint pixelIdx = as_uint( O4.w ) >> 4;
+		uint seed = WangHash( as_uint( O4.w ) + rd.frameIdx * 17117);
+		float3 T = T4.xyz;
+		float t = hit.x;
 		// end path on sky
-		if (data3.x == 1e30f)
+		if (t == 1e30f)
 		{
 			float3 skyColor = (float3)( 0.7f, 0.7f, 1.2f );
-			accumulator[pixelIdx] += (float4)( data0.xyz * skyColor, 1 );
+			accumulator[pixelIdx] += (float4)( T * skyColor, 1 );
 			continue;
 		}
 		// fetch geometry at intersection point
-		uint vertIdx = as_uint( data3.w ) * 3;
+		uint vertIdx = as_uint( hit.w ) * 3;
 		float4 v0 = verts[vertIdx];
+		uint mat = as_uint( v0.w ) >> 24;
+		// end path on light
+		float3 lightColor = (float3)( 10 );
+		if (mat == 1)
+		{
+			if (depth == 0) accumulator[pixelIdx] += (float4)( T * lightColor, 1 );
+			continue;
+		}
 		float3 vert0 = v0.xyz, vert1 = verts[vertIdx + 1].xyz, vert2 = verts[vertIdx + 2].xyz;
 		float3 N = normalize( cross( vert1 - vert0, vert2 - vert0 ) );
-		float3 D = data2.xyz;
+		float3 D = D4.xyz;
 		if (dot( N, D ) > 0) N *= -1;
-		float3 T = data0.xyz;
-		float3 O = data1.xyz;
-		float t = data3.x;
-		float3 I = O + t * D;
-		// bounce
-		if (depth < 4)
+		float3 I = O4.xyz + t * D;
+		// direct illumination: next event estimation
+		float3 P = (float3)( RandomFloat( &seed ) * 9 - 4.5f, 30, RandomFloat( &seed ) * 5 - 3.5f );
+		float3 L = P - I;
+		float NdotL = dot( N, L );
+		if (NdotL > 0)
+		{
+			uint newShadowIdx = atomic_inc( &connectTasks );
+			float dist2 = dot( L, L ), dist = sqrt( dist2 );
+			L *= 1.0f / dist;
+			float NLdotL = fabs( L.y ); // actually, fabs( dot( L, LN ) )
+			shadowOut[newShadowIdx].T = (float4)( lightColor * T * NdotL * NLdotL * (1.0f / dist2), 0 );
+			shadowOut[newShadowIdx].O = (float4)( I + L * 0.001f, as_float( pixelIdx ) );
+			shadowOut[newShadowIdx].D = (float4)( L, dist - 0.002f );
+		}
+		// indirect illumination: diffuse bounce
+		if (depth < 2)
 		{
 			uint newRayIdx = atomic_inc( &extendTasks );
 			float3 BRDF = (float3)(1) /* just white for now */ * INVPI;
-		#if 0
-			float3 R = DiffuseReflection( N, &seed );
-			float PDF = INV2PI;
-		#else
 			float3 R = CosWeightedDiffReflection( N, &seed );
 			float PDF = dot( N, R ) * INVPI;
-		#endif
 			T *= dot( N, R ) * BRDF * (1.0f / PDF);
 			raysOut[newRayIdx].T = (float4)( T, 1 );
 			raysOut[newRayIdx].O = (float4)( I + R * 0.001f, as_float( (pixelIdx << 4) + depth + 1 ) );
@@ -209,17 +221,22 @@ void kernel UpdateCounters2()
 // if not occluded.
 void kernel Connect( global float4* accumulator, global struct Potential* shadowIn )
 {
-	// obtain task
-	const int rayId = atomic_dec( &connectTasks ) - 1;
-	if (rayId < 0) return;
-	const float4 T4 = shadowIn[rayId].T;
-	const float4 O4 = shadowIn[rayId].O;
-	const float4 D4 = shadowIn[rayId].D;
-	const float3 rD = (float3)( 1.0f / D4.x, 1.0f / D4.y, 1.0f / D4.z );
-	bool occluded = false; // isoccluded_cwbvh( rd.cwbvhNodes, rd.cwbvhTris, O4.xyz, D4.xyz, rD, D4.w );
-	if (occluded) return;
-	uint pixelIdx = as_uint( O4.w );
-	accumulator[pixelIdx] += T4;
+	while (1)
+	{
+		// obtain task
+		if (connectTasks < 1) break;
+		const int rayId = atomic_dec( &connectTasks ) - 1;
+		if (rayId < 0) break;
+		const float4 T4 = shadowIn[rayId].T;
+		const float4 O4 = shadowIn[rayId].O;
+		const float4 D4 = shadowIn[rayId].D;
+		const float3 rD = (float3)( 1.0f / D4.x, 1.0f / D4.y, 1.0f / D4.z );
+		if (!isoccluded_cwbvh( rd.cwbvhNodes, rd.cwbvhTris, O4.xyz, D4.xyz, rD, D4.w ))
+		{
+			uint pixelIdx = as_uint( O4.w );
+			accumulator[pixelIdx] += T4;
+		}
+	}
 }
 
 // finalize: convert the accumulated values into final pixel values.