From 02e960e976ce6518b5d01a74a33ed88f3d9caaf8 Mon Sep 17 00:00:00 2001
From: jbikker <bikker.j@gmail.com>
Date: Sat, 4 Jan 2025 20:16:01 +0100
Subject: [PATCH] 1.2.0; SBVH unsplitting.

---
 README.md  |  4 +--
 tiny_bvh.h | 90 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 85 insertions(+), 9 deletions(-)
diff --git a/README.md b/README.md
index 566ca6b..e9dcfe9 100644
--- a/README.md
+++ b/README.md
@@ -72,7 +72,7 @@ The **performance measurement tool** can be compiled with:
 
 ````g++ -std=c++20 -mavx -Ofast tiny_bvh_speedtest.cpp -o tiny_bvh_speedtest````
 
-# Version 1.1.9
+# Version 1.2.0
 
 Version 1.1.0 introduced a <ins>change to the API</ins>. The single BVH class with multiple layouts has been replaced with a BVH class per layout. You can simply instantiate the desired layout; conversion (and data ownership) is then handled properly by the library. Examples:
 
@@ -114,7 +114,7 @@ This version of the library includes the following functionality:
 * Fast binned SAH BVH builder using AVX intrinsics
 * Fast binned SAH BVH builder using NEON intrinsices, by [wuyakuma](https://github.com/wuyakuma)
 * Double-precision binned SAH BVH builder
-* Spatial Splits ([SBVH](https://www.nvidia.in/docs/IO/77714/sbvh.pdf), Stich et al., 2009) builder
+* Spatial Splits ([SBVH](https://www.nvidia.in/docs/IO/77714/sbvh.pdf), Stich et al., 2009) builder, including "unsplitting"
 * 'Compressed Wide BVH' (CWBVH) data structure
 * BVH optimizer: reduces SAH cost and improves ray tracing performance ([Bittner et al., 2013](https://dspace.cvut.cz/bitstream/handle/10467/15603/2013-Fast-Insertion-Based-Optimization-of-Bounding-Volume-Hierarchies.pdf))
 * Collapse to N-wide MBVH using templated code
diff --git a/tiny_bvh.h b/tiny_bvh.h
index ec3a57b..03b7bf7 100644
--- a/tiny_bvh.h
+++ b/tiny_bvh.h
@@ -90,6 +90,9 @@ THE SOFTWARE.
 #define C_INT	1
 #define C_TRAV	1
 
+// SBVH: "Unsplitting"
+#define SBVH_UNSPLITTING
+
 // 'Infinity' values
 #define BVH_FAR	1e30f		// actual valid ieee range: 3.40282347E+38
 #define BVH_DBL_FAR 1e300	// actual valid ieee range: 1.797693134862315E+308
@@ -117,8 +120,8 @@ THE SOFTWARE.
 
 // library version
 #define TINY_BVH_VERSION_MAJOR	1
-#define TINY_BVH_VERSION_MINOR	1
-#define TINY_BVH_VERSION_SUB	9
+#define TINY_BVH_VERSION_MINOR	2
+#define TINY_BVH_VERSION_SUB	0
 
 // ============================================================================
 //
@@ -508,6 +511,7 @@ class BVHBase
 	bool refittable = true;			// refits are safe only if the tree has no spatial splits.
 	bool may_have_holes = false;	// threaded builds and MergeLeafs produce BVHs with unused nodes.
 	bool bvh_over_aabbs = false;	// a BVH over AABBs is useful for e.g. TLAS traversal.
+	bool bvh_over_indices = false;	// a BVH over indices cannot translate primitive index to vertex index.
 	BVHContext context;				// context used to provide user-defined allocation functions
 	// Keep track of allocated buffer size to avoid repeated allocation during layout conversion.
 	uint32_t allocatedNodes = 0;	// number of nodes allocated for the BVH.
@@ -1316,12 +1320,14 @@ void BVH::Build( const bvhvec4* vertices, const uint32_t* indices, const uint32_
 {
 	// build the BVH with a continuous array of bvhvec4 vertices, indexed by 'indices'.
 	Build( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims );
+	bvh_over_indices = true;
 }
 void BVH::Build( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims )
 {
 	// build the BVH from vertices stored in a slice, indexed by 'indices'.
 	PrepareBuild( vertices, indices, prims );
 	Build();
+	bvh_over_indices = true;
 }
 void BVH::PrepareBuild( const bvhvec4slice& vertices, const uint32_t* indices, const uint32_t prims )
 {
@@ -1491,6 +1497,7 @@ void BVH::BuildHQ( const bvhvec4* vertices, const uint32_t* indices, const uint3
 {
 	// build the BVH with a continuous array of bvhvec4 vertices, indexed by 'indices'.
 	BuildHQ( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims );
+	bvh_over_indices = true;
 }
 
 void BVH::BuildHQ( const bvhvec4slice& vertices )
@@ -1504,13 +1511,14 @@ void BVH::BuildHQ( const bvhvec4slice& vertices, const uint32_t* indices, uint32
 	// build the BVH from vertices stored in a slice, indexed by 'indices'.
 	PrepareHQBuild( vertices, indices, prims );
 	BuildHQ();
+	bvh_over_indices = true;
 }
 
 void BVH::PrepareHQBuild( const bvhvec4slice& vertices, const uint32_t* indices, const uint32_t prims )
 {
 	uint32_t primCount = prims > 0 ? prims : vertices.count / 3;
 	const uint32_t slack = primCount >> 1; // for split prims
-	const uint32_t spaceNeeded = primCount * 4;
+	const uint32_t spaceNeeded = primCount * 3;
 	// allocate memory on first build
 	if (allocatedNodes < spaceNeeded)
 	{
@@ -1635,7 +1643,7 @@ void BVH::BuildHQ()
 			}
 			// consider a spatial split
 			bool spatial = false;
-			uint32_t NL[HQBVHBINS - 1], NR[HQBVHBINS - 1], budget = sliceEnd - sliceStart;
+			uint32_t NL[HQBVHBINS - 1], NR[HQBVHBINS - 1], budget = sliceEnd - sliceStart, bestNL, bestNR;
 			bvhvec3 spatialUnion = bestLMax - bestRMin;
 			float spatialOverlap = (spatialUnion.halfArea()) / rootArea;
 			if (budget > node.triCount && splitCost < 1e30f && spatialOverlap > 1e-5f)
@@ -1694,6 +1702,7 @@ void BVH::BuildHQ()
 						{
 							spatial = true, splitCost = Cspatial, bestAxis = a, bestPos = i;
 							bestLMin = lBMin[i], bestLMax = lBMax[i], bestRMin = rBMin[i], bestRMax = rBMax[i];
+							bestNL = NL[i], bestNR = NR[i]; // for unsplitting
 							bestLMax[a] = bestRMin[a]; // accurate
 						}
 					}
@@ -1706,8 +1715,11 @@ void BVH::BuildHQ()
 			uint32_t A = sliceStart, B = sliceEnd, src = node.leftFirst;
 			if (spatial)
 			{
+				// spatial partitioning
 				const float planeDist = (node.aabbMax[bestAxis] - node.aabbMin[bestAxis]) / (HQBVHBINS * 0.9999f);
 				const float rPlaneDist = 1.0f / planeDist, nodeMin = node.aabbMin[bestAxis];
+				bvhvec3 finalLMin = bestLMin, finalLMax = bestLMax;
+				bvhvec3 finalRMin = bestRMin, finalRMax = bestRMax;
 				for (uint32_t i = 0; i < node.triCount; i++)
 				{
 					const uint32_t fragIdx = triIdxA[src++];
@@ -1715,6 +1727,42 @@ void BVH::BuildHQ()
 					const uint32_t bin2 = (uint32_t)((fragment[fragIdx].bmax[bestAxis] - nodeMin) * rPlaneDist);
 					if (bin2 <= bestPos) triIdxB[A++] = fragIdx; else if (bin1 > bestPos) triIdxB[--B] = fragIdx; else
 					{
+					#ifdef SBVH_UNSPLITTING
+						// unsplitting
+						if (!bvh_over_indices)
+						{
+							// 1. Calculate what happens if we add this primitive entirely to the left side
+							if (bestNR > 1)
+							{
+								bvhvec3 unsplitLMin = tinybvh_min( finalLMin, fragment[fragIdx].bmin );
+								bvhvec3 unsplitLMax = tinybvh_max( finalLMax, fragment[fragIdx].bmax );
+								float AL = (unsplitLMax - unsplitLMin).halfArea();
+								float AR = (finalRMax - finalRMin).halfArea();
+								float CunsplitLeft = C_TRAV + C_INT * rSAV * (AL * bestNL + AR * (bestNR - 1));
+								if (CunsplitLeft < splitCost)
+								{
+									bestNR--, splitCost = CunsplitLeft, triIdxB[A++] = fragIdx;
+									finalLMin = unsplitLMin, finalLMax = unsplitLMax;
+									continue;
+								}
+							}
+							// 2. Calculate what happens if we add this primitive entirely to the right side
+							if (bestNL > 1)
+							{
+								bvhvec3 unsplitRMin = tinybvh_min( finalRMin, fragment[fragIdx].bmin );
+								bvhvec3 unsplitRMax = tinybvh_max( finalRMax, fragment[fragIdx].bmax );
+								float AL = (finalLMax - finalLMin).halfArea();
+								float AR = (unsplitRMax - unsplitRMin).halfArea();
+								float CunsplitRight = C_TRAV + C_INT * rSAV * (AL * (bestNL - 1) + AR * bestNR);
+								if (CunsplitRight < splitCost)
+								{
+									bestNL--, splitCost = CunsplitRight, triIdxB[--B] = fragIdx;
+									finalRMin = unsplitRMin, finalRMax = unsplitRMax;
+									continue;
+								}
+							}
+						}
+					#endif
 						// split straddler
 						Fragment tmpFrag = fragment[fragIdx];
 						Fragment newFrag;
@@ -1724,6 +1772,8 @@ void BVH::BuildHQ()
 							triIdxB[A++] = fragIdx;
 					}
 				}
+				bestLMin = finalLMin, bestLMax = finalLMax;
+				bestRMin = finalRMin, bestRMax = finalRMax;
 			}
 			else
 			{
@@ -1775,8 +1825,9 @@ void BVH::BuildHQ()
 void BVH::Refit( const uint32_t nodeIdx )
 {
 	FATAL_ERROR_IF( !refittable, "BVH::Refit( .. ), refitting an SBVH." );
-	FATAL_ERROR_IF( bvhNode == 0, "BVH::Refit( WALD_32BYTE ), bvhNode == 0." );
-	FATAL_ERROR_IF( may_have_holes, "BVH::Refit( WALD_32BYTE ), bvh may have holes." );
+	FATAL_ERROR_IF( bvhNode == 0, "BVH::Refit( .. ), bvhNode == 0." );
+	FATAL_ERROR_IF( may_have_holes, "BVH::Refit( .. ), bvh may have holes." );
+	FATAL_ERROR_IF( bvh_over_indices, "BVH::Refit( .. ), bvh used indexed tris." );
 	for (int32_t i = usedNodes - 1; i >= 0; i--)
 	{
 		BVHNode& node = bvhNode[i];
@@ -2154,6 +2205,7 @@ void BVH_Verbose::Refit( const uint32_t nodeIdx )
 {
 	FATAL_ERROR_IF( !refittable, "BVH_Verbose::Refit( .. ), refitting an SBVH." );
 	FATAL_ERROR_IF( bvhNode == 0, "BVH_Verbose::Refit( .. ), bvhNode == 0." );
+	FATAL_ERROR_IF( bvh_over_indices, "BVH_Verbose::Refit( .. ), bvh used indexed tris." );
 	BVHNode& node = bvhNode[nodeIdx];
 	if (node.isLeaf()) // leaf: adjust to current triangle vertex positions
 	{
@@ -2352,6 +2404,7 @@ void BVH_GPU::Build( const bvhvec4* vertices, const uint32_t* indices, const uin
 {
 	// build the BVH with a continuous array of bvhvec4 vertices, indexed by 'indices'.
 	Build( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims );
+	bvh_over_indices = true;
 }
 
 void BVH_GPU::Build( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims )
@@ -2360,6 +2413,7 @@ void BVH_GPU::Build( const bvhvec4slice& vertices, const uint32_t* indices, uint
 	bvh.context = context;
 	bvh.BuildDefault( vertices, indices, prims );
 	ConvertFrom( bvh );
+	bvh_over_indices = true;
 }
 
 void BVH_GPU::BuildHQ( const bvhvec4* vertices, const uint32_t primCount )
@@ -2376,6 +2430,7 @@ void BVH_GPU::BuildHQ( const bvhvec4slice& vertices )
 void BVH_GPU::BuildHQ( const bvhvec4* vertices, const uint32_t* indices, const uint32_t prims )
 {
 	BuildHQ( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims );
+	bvh_over_indices = true;
 }
 
 void BVH_GPU::BuildHQ( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims )
@@ -2383,6 +2438,7 @@ void BVH_GPU::BuildHQ( const bvhvec4slice& vertices, const uint32_t* indices, ui
 	bvh.context = context;
 	bvh.BuildHQ( vertices, indices, prims );
 	ConvertFrom( bvh );
+	bvh_over_indices = true;
 }
 
 void BVH_GPU::ConvertFrom( const BVH& original )
@@ -2504,6 +2560,7 @@ void BVH_SoA::Build( const bvhvec4* vertices, const uint32_t* indices, const uin
 {
 	// build the BVH with a continuous array of bvhvec4 vertices, indexed by 'indices'.
 	Build( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims );
+	bvh_over_indices = true;
 }
 
 void BVH_SoA::Build( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims )
@@ -2512,6 +2569,7 @@ void BVH_SoA::Build( const bvhvec4slice& vertices, const uint32_t* indices, uint
 	bvh.context = context;
 	bvh.BuildDefault( vertices, indices, prims );
 	ConvertFrom( bvh );
+	bvh_over_indices = true;
 }
 
 void BVH_SoA::BuildHQ( const bvhvec4* vertices, const uint32_t primCount )
@@ -2529,6 +2587,7 @@ void BVH_SoA::BuildHQ( const bvhvec4slice& vertices )
 void BVH_SoA::BuildHQ( const bvhvec4* vertices, const uint32_t* indices, const uint32_t prims )
 {
 	BuildHQ( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims );
+	bvh_over_indices = true;
 }
 
 void BVH_SoA::BuildHQ( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims )
@@ -2536,6 +2595,7 @@ void BVH_SoA::BuildHQ( const bvhvec4slice& vertices, const uint32_t* indices, ui
 	bvh.context = context;
 	bvh.BuildHQ( vertices, indices, prims );
 	ConvertFrom( bvh );
+	bvh_over_indices = true;
 }
 
 void BVH_SoA::ConvertFrom( const BVH& original )
@@ -2613,6 +2673,7 @@ template<int M> void MBVH<M>::Build( const bvhvec4* vertices, const uint32_t* in
 {
 	// build the BVH with a continuous array of bvhvec4 vertices, indexed by 'indices'.
 	Build( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims );
+	bvh_over_indices = true;
 }
 
 template<int M> void MBVH<M>::Build( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims )
@@ -2621,6 +2682,7 @@ template<int M> void MBVH<M>::Build( const bvhvec4slice& vertices, const uint32_
 	bvh.context = context;
 	bvh.BuildDefault( vertices, indices, prims );
 	ConvertFrom( bvh );
+	bvh_over_indices = true;
 }
 
 template<int M> void MBVH<M>::BuildHQ( const bvhvec4* vertices, const uint32_t primCount )
@@ -2638,6 +2700,7 @@ template<int M> void MBVH<M>::BuildHQ( const bvhvec4slice& vertices )
 template<int M> void MBVH<M>::BuildHQ( const bvhvec4* vertices, const uint32_t* indices, const uint32_t prims )
 {
 	Build( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims );
+	bvh_over_indices = true;
 }
 
 template<int M> void MBVH<M>::BuildHQ( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims )
@@ -2645,6 +2708,7 @@ template<int M> void MBVH<M>::BuildHQ( const bvhvec4slice& vertices, const uint3
 	bvh.context = context;
 	bvh.BuildHQ( vertices, indices, prims );
 	ConvertFrom( bvh );
+	bvh_over_indices = true;
 }
 
 template<int M> void MBVH<M>::ConvertFrom( const BVH& original )
@@ -2786,6 +2850,7 @@ void BVH4_CPU::Build( const bvhvec4* vertices, const uint32_t* indices, const ui
 {
 	// build the BVH with a continuous array of bvhvec4 vertices, indexed by 'indices'.
 	Build( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims );
+	bvh_over_indices = true;
 }
 
 void BVH4_CPU::Build( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims )
@@ -2794,6 +2859,7 @@ void BVH4_CPU::Build( const bvhvec4slice& vertices, const uint32_t* indices, uin
 	bvh4.context = context;
 	bvh4.Build( vertices, indices, prims );
 	ConvertFrom( bvh4 );
+	bvh_over_indices = true;
 }
 
 void BVH4_CPU::BuildHQ( const bvhvec4* vertices, const uint32_t primCount )
@@ -2811,6 +2877,7 @@ void BVH4_CPU::BuildHQ( const bvhvec4slice& vertices )
 void BVH4_CPU::BuildHQ( const bvhvec4* vertices, const uint32_t* indices, const uint32_t prims )
 {
 	Build( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims );
+	bvh_over_indices = true;
 }
 
 void BVH4_CPU::BuildHQ( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims )
@@ -2818,6 +2885,7 @@ void BVH4_CPU::BuildHQ( const bvhvec4slice& vertices, const uint32_t* indices, u
 	bvh4.context = context;
 	bvh4.BuildHQ( vertices, indices, prims );
 	ConvertFrom( bvh4 );
+	bvh_over_indices = true;
 }
 
 void BVH4_CPU::ConvertFrom( const MBVH<4>& original )
@@ -2930,6 +2998,7 @@ void BVH4_GPU::Build( const bvhvec4* vertices, const uint32_t* indices, const ui
 {
 	// build the BVH with a continuous array of bvhvec4 vertices, indexed by 'indices'.
 	Build( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims );
+	bvh_over_indices = true;
 }
 
 void BVH4_GPU::Build( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims )
@@ -2938,6 +3007,7 @@ void BVH4_GPU::Build( const bvhvec4slice& vertices, const uint32_t* indices, uin
 	bvh4.context = context;
 	bvh4.Build( vertices, indices, prims );
 	ConvertFrom( bvh4 );
+	bvh_over_indices = true;
 }
 
 void BVH4_GPU::BuildHQ( const bvhvec4* vertices, const uint32_t primCount )
@@ -2955,6 +3025,7 @@ void BVH4_GPU::BuildHQ( const bvhvec4slice& vertices )
 void BVH4_GPU::BuildHQ( const bvhvec4* vertices, const uint32_t* indices, const uint32_t prims )
 {
 	Build( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims );
+	bvh_over_indices = true;
 }
 
 void BVH4_GPU::BuildHQ( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims )
@@ -2962,6 +3033,7 @@ void BVH4_GPU::BuildHQ( const bvhvec4slice& vertices, const uint32_t* indices, u
 	bvh4.context = context;
 	bvh4.BuildHQ( vertices, indices, prims );
 	ConvertFrom( bvh4 );
+	bvh_over_indices = true;
 }
 
 void BVH4_GPU::ConvertFrom( const MBVH<4>& original )
@@ -3259,6 +3331,7 @@ void BVH8_CWBVH::Build( const bvhvec4* vertices, const uint32_t* indices, const
 {
 	// build the BVH with a continuous array of bvhvec4 vertices, indexed by 'indices'.
 	Build( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims );
+	bvh_over_indices = true;
 }
 
 void BVH8_CWBVH::Build( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims )
@@ -3267,6 +3340,7 @@ void BVH8_CWBVH::Build( const bvhvec4slice& vertices, const uint32_t* indices, u
 	bvh8.context = context;
 	bvh8.Build( vertices, indices, prims );
 	ConvertFrom( bvh8 );
+	bvh_over_indices = true;
 }
 
 void BVH8_CWBVH::BuildHQ( const bvhvec4* vertices, const uint32_t primCount )
@@ -3284,6 +3358,7 @@ void BVH8_CWBVH::BuildHQ( const bvhvec4slice& vertices )
 void BVH8_CWBVH::BuildHQ( const bvhvec4* vertices, const uint32_t* indices, const uint32_t prims )
 {
 	Build( bvhvec4slice{ vertices, prims * 3, sizeof( bvhvec4 ) }, indices, prims );
+	bvh_over_indices = true;
 }
 
 void BVH8_CWBVH::BuildHQ( const bvhvec4slice& vertices, const uint32_t* indices, uint32_t prims )
@@ -3291,6 +3366,7 @@ void BVH8_CWBVH::BuildHQ( const bvhvec4slice& vertices, const uint32_t* indices,
 	bvh8.context = context;
 	bvh8.BuildHQ( vertices, indices, prims );
 	ConvertFrom( bvh8 );
+	bvh_over_indices = true;
 }
 
 void BVH8_CWBVH::ConvertFrom( MBVH<8>& original )
@@ -5936,7 +6012,7 @@ bool BVH::ClipFrag( const Fragment& orig, Fragment& newFrag, bvhvec3 bmin, bvhve
 	if (orig.clipped)
 	{
 		// generic case: Sutherland-Hodgeman against six bounding planes
-		bvhvec3 vin[10] = { verts[vidx], verts[vidx + 1], verts[vidx + 2] }, vout[10];
+		bvhvec3 vin[16] = { verts[vidx], verts[vidx + 1], verts[vidx + 2] }, vout[16];
 		for (uint32_t a = 0; a < 3; a++)
 		{
 			const float eps = minDim.cell[a];