diff --git a/examples_tests b/examples_tests
index 4e7e5dd5f2..5f2a1f7e9d 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 4e7e5dd5f2e707399c155c0a2e5ec009a848d1e4
+Subproject commit 5f2a1f7e9d52fc2112bb24629fff2157b92ce665
diff --git a/include/nbl/builtin/hlsl/algorithm.hlsl b/include/nbl/builtin/hlsl/algorithm.hlsl
index 631001686e..d80ebdf443 100644
--- a/include/nbl/builtin/hlsl/algorithm.hlsl
+++ b/include/nbl/builtin/hlsl/algorithm.hlsl
@@ -142,7 +142,9 @@ struct bound_t
 
     void comp_step(NBL_REF_ARG(Accessor) accessor, const uint32_t testPoint, const uint32_t rightBegin)
     {
-        if (compare(accessor[testPoint],value))
+        typename Accessor::value_type val; // destination for the element fetched from the accessor
+        accessor.get(testPoint, val); // element at testPoint is read through get() into val rather than via accessor[testPoint]
+        if (compare(val,value)) // comparison holds for the probed element: move `it` to rightBegin
             it = rightBegin;
     }
     void comp_step(NBL_REF_ARG(Accessor) accessor, const uint32_t testPoint)
diff --git a/include/nbl/builtin/hlsl/bit.hlsl b/include/nbl/builtin/hlsl/bit.hlsl
index 71075d5491..f712c5aad5 100644
--- a/include/nbl/builtin/hlsl/bit.hlsl
+++ b/include/nbl/builtin/hlsl/bit.hlsl
@@ -3,6 +3,7 @@
 
 
 #include <nbl/builtin/hlsl/macros.h>
+#include <nbl/builtin/hlsl/cpp_compat/basic.h>
 
 
 #ifndef __HLSL_VERSION
@@ -123,5 +124,32 @@ uint16_t countl_zero(T n)
 }
 }
 #endif
- 
+
+namespace nbl
+{
+namespace hlsl
+{
+
+// Variable-width sub-word bit rotation: both helpers rotate only the low `width` bits of `value`; bits above that field are cleared, and a rotation amount `bits >= width` is treated as 0.
+template<typename T>
+NBL_CONSTEXPR_FUNC T rotl(T value, uint32_t bits, uint32_t width)
+{
+    const T mask = (width >= sizeof(T) * 8) ? ~T(0) : ((T(1) << width) - T(1)); // all-ones over the low `width` bits (whole type when width covers it)
+    value &= mask; // discard anything above the rotated field
+    bits &= -(bits < width); // keep `bits` when it is in range, otherwise collapse it to 0
+    return (bits != 0u) ? (((value << bits) | (value >> (width - bits))) & mask) : value; // bits == 0 must not reach `value >> width`: a shift by the full bit width of T is undefined in C++
+}
+
+template<typename T> // right-rotation counterpart of rotl: rotates the low `width` bits of `value` right by `bits`
+NBL_CONSTEXPR_FUNC T rotr(T value, uint32_t bits, uint32_t width)
+{
+    const T mask = (width >= sizeof(T) * 8) ? ~T(0) : ((T(1) << width) - T(1)); // all-ones over the low `width` bits (whole type when width covers it)
+    value &= mask; // discard anything above the rotated field
+    bits &= -(bits < width); // keep `bits` when it is in range, otherwise collapse it to 0
+    return (bits != 0u) ? (((value >> bits) | (value << (width - bits))) & mask) : value; // bits == 0 must not reach `value << width`: a shift by the full bit width of T is undefined in C++
+}
+
+}
+}
+
 #endif
diff --git a/include/nbl/builtin/hlsl/functional.hlsl b/include/nbl/builtin/hlsl/functional.hlsl
index 118fe07c63..6a155f2b92 100644
--- a/include/nbl/builtin/hlsl/functional.hlsl
+++ b/include/nbl/builtin/hlsl/functional.hlsl
@@ -89,12 +89,23 @@ struct reference_wrapper : enable_if_t<
         return lhs OP rhs; \
     }
 
+#define ALIAS_STD_CMP(NAME,OP) template<typename T NBL_STRUCT_CONSTRAINABLE > struct NAME { /* struct is left open; each use site appends the closing brace and semicolon */ \
+    using type_t = T; \
+    \
+    bool operator()(NBL_CONST_REF_ARG(T) lhs, NBL_CONST_REF_ARG(T) rhs) /* comparison functor: the call operator's result type is bool */ \
+    { \
+        return lhs OP rhs; \
+    }
+
 
 #else // CPP
 
 #define ALIAS_STD(NAME,OP) template<typename T> struct NAME : std::NAME<T> { \
     using type_t = T;
 
+#define ALIAS_STD_CMP(NAME,OP) template<typename T> struct NAME : std::NAME<T> { \
+    using type_t = T; /* C++ side: NAME derives from std::NAME<T>; the OP argument is not used by this variant */
+
 #endif
 
 ALIAS_STD(bit_and,&)
@@ -136,14 +147,15 @@ ALIAS_STD(divides,/)
 };
 
 
-ALIAS_STD(equal_to, ==) };
-ALIAS_STD(not_equal_to, !=) };
-ALIAS_STD(greater, >) };
-ALIAS_STD(less, <) };
-ALIAS_STD(greater_equal, >=) };
-ALIAS_STD(less_equal, <=) };
+ALIAS_STD_CMP(equal_to, ==) };
+ALIAS_STD_CMP(not_equal_to, !=) };
+ALIAS_STD_CMP(greater, >) };
+ALIAS_STD_CMP(less, <) };
+ALIAS_STD_CMP(greater_equal, >=) };
+ALIAS_STD_CMP(less_equal, <=) };
 
 #undef ALIAS_STD
+#undef ALIAS_STD_CMP
 
 // The above comparison operators return bool on STD, but in HLSL they're supposed to yield bool vectors, so here's a specialization so that they return `vector<bool, N>` for vectorial types
 
diff --git a/include/nbl/builtin/hlsl/ies/sampler.hlsl b/include/nbl/builtin/hlsl/ies/sampler.hlsl
index ab4046477c..a6309fd128 100644
--- a/include/nbl/builtin/hlsl/ies/sampler.hlsl
+++ b/include/nbl/builtin/hlsl/ies/sampler.hlsl
@@ -85,7 +85,7 @@ struct CandelaSampler
         const angle_t vAngle = degrees(polar.theta);
         const angle_t hAngle = degrees(__wrapPhi(polar.phi, symmetry));
 
-#define NBL_IES_DEF_ANGLE_ACC(T, EXPR) struct T { using value_type = angle_t; accessor_t acc; value_type operator[](uint32_t idx) NBL_CONST_MEMBER_FUNC { return EXPR; } };
+#define NBL_IES_DEF_ANGLE_ACC(T, EXPR) struct T { using value_type = angle_t; accessor_t acc; value_type operator[](uint32_t idx) NBL_CONST_MEMBER_FUNC { return EXPR; } void get(uint32_t idx, NBL_REF_ARG(value_type) val) NBL_CONST_MEMBER_FUNC { val = EXPR; } }; // generated accessor: operator[] returns EXPR for idx, get() writes the same EXPR into val
 
         NBL_IES_DEF_ANGLE_ACC(VAcc, acc.vAngle(idx))
         NBL_IES_DEF_ANGLE_ACC(HAcc, acc.hAngle(idx))
diff --git a/include/nbl/builtin/hlsl/math/functions.hlsl b/include/nbl/builtin/hlsl/math/functions.hlsl
index b5b6f8feea..a80bdda904 100644
--- a/include/nbl/builtin/hlsl/math/functions.hlsl
+++ b/include/nbl/builtin/hlsl/math/functions.hlsl
@@ -93,13 +93,22 @@ scalar_type_t<T> lpNorm(NBL_CONST_REF_ARG(T) v)
 
 
 // valid only for `theta` in [-PI,PI]
-template <typename T NBL_FUNC_REQUIRES(concepts::FloatingPointLikeScalar<T>)
+// UseRealSinCos=true  -> back-to-back sin + cos. Saturates the special-function pipeline, enables vendor sincos fusion, full precision near multiples of pi.
+// UseRealSinCos=false -> cos + sqrt(1-c*c) with sign recovered from theta. Saves one special-function op when cos alone is cheaper than sin+cos, but suffers catastrophic cancellation as |c| -> 1.
+template <typename T, bool UseRealSinCos = true NBL_FUNC_REQUIRES(concepts::FloatingPointLikeScalar<T>)
 void sincos(T theta, NBL_REF_ARG(T) s, NBL_REF_ARG(T) c)
 {
-    s = sin<T>(theta);
-    c = cos<T>(theta);
-    // s = sqrt<T>(T(NBL_FP64_LITERAL(1.0))-c*c);
-    // s = ieee754::flipSign(s, theta < T(NBL_FP64_LITERAL(0.0)));
+    if (UseRealSinCos) // template parameter, so the path taken is fixed per instantiation
+    {
+        s = sin<T>(theta);
+        c = cos<T>(theta);
+    }
+    else
+    {
+        c = cos<T>(theta);
+        s = sqrt<T>(T(NBL_FP64_LITERAL(1.0))-c*c); // magnitude of the sine derived from the cosine; non-negative at this point
+        s = ieee754::flipSign(s, theta < T(NBL_FP64_LITERAL(0.0))); // presumably negates s when theta is negative, restoring the sine's sign on [-PI,PI] -- confirm flipSign semantics
+    }
 }
 
 template <typename T NBL_FUNC_REQUIRES(vector_traits<T>::Dimension == 3)
diff --git a/include/nbl/builtin/hlsl/sampling/alias_table.hlsl b/include/nbl/builtin/hlsl/sampling/alias_table.hlsl
index 15742e10f3..8e7b3249a0 100644
--- a/include/nbl/builtin/hlsl/sampling/alias_table.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/alias_table.hlsl
@@ -7,6 +7,7 @@
 
 #include <nbl/builtin/hlsl/cpp_compat.hlsl>
 #include <nbl/builtin/hlsl/bit.hlsl>
+#include <nbl/builtin/hlsl/cpp_compat/intrinsics.hlsl>
 #include <nbl/builtin/hlsl/concepts/core.hlsl>
 #include <nbl/builtin/hlsl/concepts/accessors/generic_shared_data.hlsl>
 
@@ -17,84 +18,187 @@ namespace hlsl
 namespace sampling
 {
 
-// Alias Method (Vose/Walker) discrete sampler.
-//
-// Samples a discrete index in [0, N) with probability proportional to
-// precomputed weights in O(1) time per sample, using a prebuilt alias table.
-//
-// Accessor template parameters must satisfy GenericReadAccessor:
-//   accessor.template get<V, I>(index, outVal)  // void, writes to outVal
-//
-// - ProbabilityAccessor: reads scalar_type threshold in [0, 1] for bin i
-// - AliasIndexAccessor:  reads uint32_t redirect index for bin i
-// - PdfAccessor:         reads scalar_type weight[i] / totalWeight
-//
-// Satisfies TractableSampler (not BackwardTractableSampler: the mapping is discrete).
-// The cache stores the PDF value looked up during generate, avoiding redundant
-// storage of the codomain (sampled index) which is already the return value.
-template<typename T, typename Domain, typename Codomain, typename ProbabilityAccessor, typename AliasIndexAccessor, typename PdfAccessor
+// Packed alias-entry bit layout shared by every packed variant. One 32-bit
+// word holds the redirect index in the low Log2N bits and the stay-
+// probability quantized as an unorm in the high (32 - Log2N) bits.
+//   u * N  = scaled;  bin = floor(scaled);  remainder = scaled - bin
+//   if (remainder < getStayProb(word))  -> result = bin
+//   else                                -> result = getTarget(word)
+// Quantizing the threshold to (32 - Log2N) bits is precision-neutral: `u`
+// already consumed Log2N bits of randomness producing `bin`, so `remainder`
+// carries exactly that many bits of discriminatory power.
+namespace impl
+{
+template<uint32_t Log2N> // Log2N: number of low bits of the packed word that hold the redirect index
+struct AliasBitDecoder
+{
+	static uint32_t getTarget(uint32_t word) // returns the redirect index stored in the low Log2N bits of `word`
+	{
+		return (Log2N >= 32u) ? word : (word & ((1u << Log2N) - 1u)); // guard: `1u << 32` is not a valid shift, the whole word is the target in that case
+	}
+	template<typename T>
+	static T getStayProb(uint32_t word) // returns the high (32 - Log2N) bits of `word` decoded as an unorm in [0, 1]
+	{
+		const uint32_t unormMax = (Log2N >= 32u) ? 1u : ((~0u) >> Log2N); // largest encodable threshold; placeholder 1 when no threshold bits exist
+		return (Log2N >= 32u) ? T(0) : (T(word >> Log2N) / T(unormMax)); // no threshold bits left -> stay probability 0, otherwise normalise by unormMax
+	}
+};
+} // namespace impl
+
+// 8 B entry used by the NBig == true variant. Embeds the bin's own pdf
+// alongside the packed word so the common stay-case needs no extra tap.
+template<typename T> // T: scalar type used for the stored pdf
+struct PackedAliasEntryB
+{
+	uint32_t packedWord;	// low Log2N bits: redirect target index; high (32 - Log2N) bits: stay probability quantized as an unorm
+	T ownPdf;				// pdf of the bin this entry belongs to, stored next to packedWord
+};
+
+
+// NBig == false: 4 B packed word per bin + separate pdf[] array. Per sample
+// = one 4 B word load + one unconditional 4 B pdf[] tap indexed by the
+// selected bin (either the current bin or its redirect). Total 8 B whether
+// the sample stays or aliases. Favours small N.
+template<typename T, typename Domain, typename Codomain, typename PackedWordAccessor, typename PdfAccessor, uint32_t Log2N
 	NBL_PRIMARY_REQUIRES(
 		concepts::UnsignedIntegralScalar<Codomain> &&
-		concepts::accessors::GenericReadAccessor<ProbabilityAccessor, T, Codomain> &&
-		concepts::accessors::GenericReadAccessor<AliasIndexAccessor, Codomain, Codomain> &&
+		concepts::accessors::GenericReadAccessor<PackedWordAccessor, uint32_t, Codomain> &&
 		concepts::accessors::GenericReadAccessor<PdfAccessor, T, Codomain>)
-struct AliasTable
+struct PackedAliasTableA
 {
 	using scalar_type = T;
-
 	using domain_type = Domain;
 	using codomain_type = Codomain;
 	using density_type = scalar_type;
 	using weight_type = density_type;
+	using decoder = impl::AliasBitDecoder<Log2N>;
+	NBL_CONSTEXPR_STATIC_INLINE bool NBig = false;
 
 	struct cache_type
 	{
 		density_type pdf;
 	};
 
-	static AliasTable create(NBL_CONST_REF_ARG(ProbabilityAccessor) _probAccessor, NBL_CONST_REF_ARG(AliasIndexAccessor) _aliasAccessor, NBL_CONST_REF_ARG(PdfAccessor) _pdfAccessor, codomain_type _size)
+	static PackedAliasTableA create(NBL_CONST_REF_ARG(PackedWordAccessor) _entryAcc, NBL_CONST_REF_ARG(PdfAccessor) _pdfAcc, codomain_type _size)
 	{
-		AliasTable retval;
-		retval.probAccessor = _probAccessor;
-		retval.aliasAccessor = _aliasAccessor;
-		retval.pdfAccessor = _pdfAccessor;
-		// Precompute tableSize as float minus 1 ULP so that u=1.0 maps to bin N-1
+		PackedAliasTableA retval;
+		retval.entryAcc = _entryAcc;
+		retval.pdfAcc = _pdfAcc;
 		const scalar_type exact = scalar_type(_size);
 		retval.tableSizeMinusUlp = nbl::hlsl::bit_cast<scalar_type>(nbl::hlsl::bit_cast<uint32_t>(exact) - 1u);
 		return retval;
 	}
 
-	// BasicSampler interface
 	codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC
 	{
 		const scalar_type scaled = u * tableSizeMinusUlp;
 		const codomain_type bin = _static_cast<codomain_type>(scaled);
 		const scalar_type remainder = scaled - scalar_type(bin);
 
-		scalar_type prob;
-		probAccessor.template get<scalar_type, codomain_type>(bin, prob);
-
-		// Use if-statement to avoid select: aliasIndex is a dependent read
-		codomain_type result;
-		if (remainder < prob)
-		{
-			result = bin;
-		}
-		else
-		{
-			codomain_type alias;
-			aliasAccessor.template get<codomain_type, codomain_type>(bin, alias);
-			result = alias;
-		}
+		uint32_t packedWord;
+		entryAcc.template get<uint32_t, codomain_type>(bin, packedWord);
+		return hlsl::select(remainder < decoder::template getStayProb<scalar_type>(packedWord), bin, codomain_type(decoder::getTarget(packedWord)));
+	}
 
+	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		const codomain_type result = generate(u);
+		pdfAcc.template get<scalar_type, codomain_type>(result, cache.pdf);
 		return result;
 	}
 
-	// TractableSampler interface
+	density_type forwardPdf(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		return cache.pdf; // pdf recorded in the cache; `u` itself is not consulted
+	}
+
+	weight_type forwardWeight(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+	{
+		return cache.pdf; // the weight is the cached pdf itself, same value forwardPdf returns
+	}
+
+	density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC
+	{
+		scalar_type pdf; // filled by the accessor below
+		pdfAcc.template get<scalar_type, codomain_type>(v, pdf); // reads the pdf entry for index v through pdfAcc
+		return pdf;
+	}
+
+	weight_type backwardWeight(const codomain_type v) NBL_CONST_MEMBER_FUNC
+	{
+		return backwardPdf(v); // the weight is defined as the pdf looked up for v
+	}
+
+	PackedWordAccessor entryAcc;
+	PdfAccessor pdfAcc;
+	scalar_type tableSizeMinusUlp;
+};
+
+// NBig == true: 8 B entry {packedWord, ownPdf} + separate pdf[] array. Per
+// sample = one 8 B entry load (covers the common stay case where cache
+// already has ownPdf). If the sample aliases, a conditional 4 B pdf[target]
+// tap fills the cache. Total 8 B stay, 12 B aliased. Favours large N.
+template<typename T, typename Domain, typename Codomain, typename EntryAccessor, typename PdfAccessor, uint32_t Log2N
+	NBL_PRIMARY_REQUIRES(
+		concepts::UnsignedIntegralScalar<Codomain> &&
+		concepts::accessors::GenericReadAccessor<EntryAccessor, PackedAliasEntryB<T>, Codomain> &&
+		concepts::accessors::GenericReadAccessor<PdfAccessor, T, Codomain>)
+struct PackedAliasTableB
+{
+	using scalar_type = T;
+	using domain_type = Domain;
+	using codomain_type = Codomain;
+	using density_type = scalar_type;
+	using weight_type = density_type;
+	using entry_type = PackedAliasEntryB<scalar_type>;
+	using decoder = impl::AliasBitDecoder<Log2N>;
+	NBL_CONSTEXPR_STATIC_INLINE bool NBig = true;
+
+	struct cache_type
+	{
+		density_type pdf;
+	};
+
+	static PackedAliasTableB create(NBL_CONST_REF_ARG(EntryAccessor) _entryAcc, NBL_CONST_REF_ARG(PdfAccessor) _pdfAcc, codomain_type _size)
+	{
+		PackedAliasTableB retval;
+		retval.entryAcc = _entryAcc; // accessor for the {packedWord, ownPdf} entries
+		retval.pdfAcc = _pdfAcc; // accessor for the separate pdf array
+		const scalar_type exact = scalar_type(_size); // table size converted to the scalar type
+		retval.tableSizeMinusUlp = nbl::hlsl::bit_cast<scalar_type>(nbl::hlsl::bit_cast<uint32_t>(exact) - 1u); // bit pattern minus one: table size minus 1 ULP, so u = 1.0 maps to bin N-1 (bit_cast through uint32_t presumes a 32-bit scalar_type)
+		return retval;
+	}
+
+	codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC
+	{
+		const scalar_type scaled = u * tableSizeMinusUlp; // stretch u over the table
+		const codomain_type bin = _static_cast<codomain_type>(scaled); // integer part selects the bin
+		const scalar_type remainder = scaled - scalar_type(bin); // fractional part is compared against the bin's stay probability
+
+		entry_type entry; // filled by the accessor below
+		entryAcc.template get<entry_type, codomain_type>(bin, entry); // fetch the bin's {packedWord, ownPdf} entry
+		return hlsl::select(remainder < decoder::template getStayProb<scalar_type>(entry.packedWord), bin, codomain_type(decoder::getTarget(entry.packedWord))); // stay in `bin` when the remainder is below the threshold, otherwise take the redirect target
+	}
+
 	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
 	{
-		const codomain_type result = generate(u);
-		pdfAccessor.template get<scalar_type, codomain_type>(result, cache.pdf);
+		const scalar_type scaled = u * tableSizeMinusUlp;
+		const codomain_type bin = _static_cast<codomain_type>(scaled);
+		const scalar_type remainder = scaled - scalar_type(bin);
+
+		entry_type entry;
+		entryAcc.template get<entry_type, codomain_type>(bin, entry);
+
+		const bool stay = remainder < decoder::template getStayProb<scalar_type>(entry.packedWord);
+		
+		cache.pdf = entry.ownPdf;
+		codomain_type result = bin;
+		if (!stay)
+		{
+			const codomain_type target = codomain_type(decoder::getTarget(entry.packedWord));
+			pdfAcc.template get<scalar_type, codomain_type>(target, cache.pdf);
+			result = target;
+		}
 		return result;
 	}
 
@@ -111,7 +215,7 @@ struct AliasTable
 	density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC
 	{
 		scalar_type pdf;
-		pdfAccessor.template get<scalar_type, codomain_type>(v, pdf);
+		pdfAcc.template get<scalar_type, codomain_type>(v, pdf);
 		return pdf;
 	}
 
@@ -120,9 +224,8 @@ struct AliasTable
 		return backwardPdf(v);
 	}
 
-	ProbabilityAccessor probAccessor;
-	AliasIndexAccessor aliasAccessor;
-	PdfAccessor pdfAccessor;
+	EntryAccessor entryAcc;
+	PdfAccessor pdfAcc;
 	scalar_type tableSizeMinusUlp;
 };
 
diff --git a/include/nbl/builtin/hlsl/sampling/alias_table_builder.h b/include/nbl/builtin/hlsl/sampling/alias_table_builder.h
index d02d21488c..2c7c53fd5f 100644
--- a/include/nbl/builtin/hlsl/sampling/alias_table_builder.h
+++ b/include/nbl/builtin/hlsl/sampling/alias_table_builder.h
@@ -5,7 +5,12 @@
 #ifndef _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_BUILDER_H_INCLUDED_
 #define _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_BUILDER_H_INCLUDED_
 
+#include <cmath>
 #include <cstdint>
+#include <span>
+#include <vector>
+
+#include <nbl/builtin/hlsl/sampling/alias_table.hlsl>
 
 namespace nbl
 {
@@ -15,74 +20,139 @@ namespace sampling
 {
 
 // Builds the alias table from an array of non-negative weights.
-// All output arrays must be pre-allocated to N entries.
 //
-// Parameters:
-//   weights         - input weights (non-negative, at least one must be > 0)
-//   N               - number of entries
-//   outProbability  - [out] alias table probability threshold per bin, in [0, 1]
-//   outAlias        - [out] alias redirect index per bin
-//   outPdf          - [out] normalized PDF per entry: weight[i] / sum(weights)
-//   workspace       - scratch buffer of N uint32_t entries
+// When `weights.size()` is a power of two, the builder transparently appends
+// one zero-weight dummy bucket so the GPU-facing table size is N+1 (odd),
+// which breaks PoT-periodic address patterns that alias memory channels /
+// cache sets on most GPUs. The sampled distribution is unchanged: the dummy
+// has stayProb = 0 and always redirects to a real donor.
+//
+// Output vectors are `resize`d by the builder to the final table size, so
+// the caller just passes (possibly empty) vectors and reads back the
+// returned size. That returned value is what to pass to the sampler's
+// `_size` argument and to use when packing / uploading.
 template<typename T>
 struct AliasTableBuilder
 {
-	static void build(std::span<const T> weights, T* outProbability, uint32_t* outAlias, T* outPdf, uint32_t* workspace)
-	{
-		T totalWeight = T(0);
-      for (uint32_t i = 0; i < weights.size(); i++)
-			totalWeight += weights[i];
-
-		const T rcpTotalWeight = T(1) / totalWeight;
-
-		// Compute PDFs, scaled probabilities, and partition into small/large in one pass
-		uint32_t smallEnd = 0;
-      uint32_t largeBegin = weights.size();
-      for (uint32_t i = 0; i < weights.size(); i++)
-		{
-			outPdf[i] = weights[i] * rcpTotalWeight;
-         outProbability[i] = outPdf[i] * T(weights.size());
-
-			if (outProbability[i] < T(1))
-				workspace[smallEnd++] = i;
-			else
-				workspace[--largeBegin] = i;
-		}
-
-		// Pair small and large entries
-      while (smallEnd > 0 && largeBegin < weights.size())
-		{
-			const uint32_t s = workspace[--smallEnd];
-			const uint32_t l = workspace[largeBegin];
-
-			outAlias[s] = l;
-			// outProbability[s] already holds the correct probability for bin s
-
-			outProbability[l] -= (T(1) - outProbability[s]);
-
-			if (outProbability[l] < T(1))
-			{
-				// l became small: pop from large, push to small
-				largeBegin++;
-				workspace[smallEnd++] = l;
-			}
-			// else l stays in large (don't pop, reuse next iteration)
-		}
-
-		// Remaining entries (floating point rounding artifacts)
-		while (smallEnd > 0)
-		{
-			const uint32_t s = workspace[--smallEnd];
-			outProbability[s] = T(1);
-			outAlias[s] = s;
-		}
-      while (largeBegin < weights.size())
-		{
-			const uint32_t l = workspace[largeBegin++];
-			outProbability[l] = T(1);
-			outAlias[l] = l;
-		}
-	}
+   // Builds the alias table for `weights` and returns the final table size.
+   // When weights.size() is a power of two greater than 1, one zero-weight
+   // dummy bucket is appended so the table size is not a power of two: PoT-sized
+   // alias tables hit GPU memory channel / cache set aliasing that can be
+   // wildly (sometimes 2x+) slower than a nearby non-PoT size. The builder resizes
+   // all three output vectors to the returned size and allocates its own scratch.
+   static uint32_t build(std::span<const T> weights, std::vector<T>& outProbability, std::vector<uint32_t>& outAlias, std::vector<T>& outPdf)
+   {
+      const uint32_t userN  = static_cast<uint32_t>(weights.size());
+      const uint32_t tableN = (userN > 1u && (userN & (userN - 1u)) == 0u) ? (userN + 1u) : userN; // userN + 1 only when userN is a power of two above 1
+
+      outProbability.resize(tableN);
+      outAlias.resize(tableN);
+      outPdf.resize(tableN);
+      std::vector<uint32_t> workspace(tableN); // small indices grow from the front, large indices from the back
+
+      T totalWeight = T(0);
+      for (uint32_t i = 0; i < userN; i++)
+         totalWeight += weights[i];
+
+      const T rcpTotalWeight = T(1) / totalWeight; // NOTE(review): a zero total (empty or all-zero weights) divides by zero here -- presumably callers must pass at least one positive weight
+
+      // Compute PDFs, scaled probabilities, and partition into small/large in one pass
+      uint32_t smallEnd   = 0u;
+      uint32_t largeBegin = tableN;
+      for (uint32_t i = 0; i < userN; i++)
+      {
+         outPdf[i]         = weights[i] * rcpTotalWeight;
+         outProbability[i] = outPdf[i] * T(tableN); // scaled by the padded table size, not userN
+
+         if (outProbability[i] < T(1))
+            workspace[smallEnd++] = i;
+         else
+            workspace[--largeBegin] = i;
+      }
+      // PoT dodge tail: one zero-weight dummy at index userN, always in the small list.
+      if (tableN != userN)
+      {
+         outPdf[userN]         = T(0);
+         outProbability[userN] = T(0);
+         workspace[smallEnd++] = userN; // pushed last, so it is the first small entry the pairing loop pops
+      }
+
+      // Pair small and large entries
+      while (smallEnd > 0u && largeBegin < tableN)
+      {
+         const uint32_t s = workspace[--smallEnd];
+         const uint32_t l = workspace[largeBegin];
+
+         outAlias[s] = l;
+         // outProbability[s] already holds the correct probability for bin s
+
+         outProbability[l] -= (T(1) - outProbability[s]); // the large entry donates what bin s was missing
+
+         if (outProbability[l] < T(1))
+         {
+            // l became small: pop from large, push to small
+            largeBegin++;
+            workspace[smallEnd++] = l;
+         }
+         // else l stays in large (don't pop, reuse next iteration)
+      }
+
+      // Remaining entries (floating point rounding artifacts)
+      while (smallEnd > 0u)
+      {
+         const uint32_t s  = workspace[--smallEnd];
+         outProbability[s] = T(1);
+         outAlias[s]       = s;
+      }
+      while (largeBegin < tableN)
+      {
+         const uint32_t l  = workspace[largeBegin++];
+         outProbability[l] = T(1);
+         outAlias[l]       = l;
+      }
+
+      return tableN; // size of every output vector, including the dummy bucket when one was added
+   }
+
+   // Pack (target, stayProb) into a single 32-bit word: the low Log2N bits hold
+   // target, the high (32 - Log2N) bits hold stayProb clamped to [0, 1] and rounded
+   // to an unorm. Log2N >= 32 leaves no threshold bits, so only target is stored.
+   template<uint32_t Log2N>
+   static uint32_t packWord(uint32_t target, T stayProb)
+   {
+      const uint32_t targetMask = (Log2N >= 32u) ? ~0u : ((1u << Log2N) - 1u);
+      const T        clamped    = !(stayProb > T(0)) ? T(0) : (stayProb < T(1) ? stayProb : T(1)); // NaN and negatives become 0 so the integer cast below stays defined
+      const uint32_t unormMax   = (Log2N >= 32u) ? 0u : ((~0u) >> Log2N); // guard: a shift by 32 is undefined
+      const uint32_t probUnorm  = static_cast<uint32_t>(std::round(static_cast<double>(clamped) * static_cast<double>(unormMax)));
+      return (target & targetMask) | ((Log2N >= 32u) ? 0u : (probUnorm << Log2N)); // same guard for the left shift
+   }
+
+   // Variant A: pack the SoA outputs into an array of 4 B packed words. The
+   // pdf[] array is consumed directly by the sampler as a second accessor.
+   // outWords and alias must each hold at least probability.size() entries.
+   template<uint32_t Log2N>
+   static void packA(std::span<const T> probability, std::span<const uint32_t> alias, uint32_t* outWords)
+   {
+      const uint32_t N = static_cast<uint32_t>(probability.size()); // entry count is taken from `probability` alone
+      for (uint32_t i = 0; i < N; i++)
+         outWords[i] = packWord<Log2N>(alias[i], probability[i]);
+   }
+
+   // Variant B: pack the SoA outputs into 8 B entries { packedWord, ownPdf }.
+   // The pdf[] array is *also* passed to the sampler (same contents as ownPdf
+   // column, but tapped independently with a 4 B fetch when the sample aliases).
+   // outEntries, alias and pdf must each hold at least probability.size() entries.
+   template<uint32_t Log2N>
+   static void packB(std::span<const T> probability, std::span<const uint32_t> alias, std::span<const T> pdf,
+      PackedAliasEntryB<T>* outEntries)
+   {
+      const uint32_t N = static_cast<uint32_t>(probability.size()); // entry count is taken from `probability` alone
+      for (uint32_t i = 0; i < N; i++)
+      {
+         outEntries[i].packedWord = packWord<Log2N>(alias[i], probability[i]);
+         outEntries[i].ownPdf     = pdf[i]; // copy of the bin's own pdf, stored beside the packed word
+      }
+   }
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
index 35f1391930..b56073074b 100644
--- a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl
@@ -45,10 +45,10 @@ struct Bilinear
         vector2_type twiceAreasUnderXCurve = vector2_type(bilinearCoeffs[0] + bilinearCoeffs[1], bilinearCoeffs[2] + bilinearCoeffs[3]);
         // Linear::create adds FLT_MIN internally, replicate here so both divisions share
         // the same denominator (sum + 2*min), enabling CSE to merge them into one division
-        const scalar_type safeSum = twiceAreasUnderXCurve[0] + twiceAreasUnderXCurve[1] + scalar_type(2.0) * hlsl::numeric_limits<scalar_type>::min;
-        const scalar_type yNormFactor = scalar_type(2.0) / safeSum;
+        const scalar_type safeSum = twiceAreasUnderXCurve[0] + twiceAreasUnderXCurve[1] + _static_cast<scalar_type>(2.0) * hlsl::numeric_limits<scalar_type>::min;
+        const scalar_type yNormFactor = _static_cast<scalar_type>(2.0) / safeSum;
         retval.lineary = Linear<scalar_type>::create(twiceAreasUnderXCurve);
-        retval.normFactor = yNormFactor * scalar_type(2.0);
+        retval.normFactor = yNormFactor * _static_cast<scalar_type>(2.0);
         return retval;
     }
 
@@ -65,7 +65,7 @@ struct Bilinear
 
         // bilinear PDF = marginal_y_pdf * conditional_x_pdf; reuse both linear caches
         const scalar_type yPdf = lineary.forwardPdf(u.y, linearYCache);
-        cache.normalizedStart = yPdf * linearx.linearCoeffStart;
+        cache.normalizedStart = yPdf * linearx.normalizedCoeffStart;
         cache.linearXCache.diffTimesX *= yPdf;
         return p;
     }
diff --git a/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl b/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl
index b9efd1ec35..9ae0c83c05 100644
--- a/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl
@@ -37,16 +37,16 @@ struct BoxMullerTransform
     {
         BoxMullerTransform<T> retval;
         retval.stddev = _stddev;
-        retval.halfRcpStddev2 = scalar_type(0.5) / (_stddev * _stddev);
+        retval.halfRcpStddev2 = _static_cast<scalar_type>(0.5) / (_stddev * _stddev);
         return retval;
     }
 
     codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
     {
         scalar_type sinPhi, cosPhi;
-        math::sincos<scalar_type>(scalar_type(2.0) * numbers::pi<scalar_type> * u.y - numbers::pi<scalar_type>, sinPhi, cosPhi);
+        math::sincos<scalar_type>(_static_cast<scalar_type>(2.0) * numbers::pi<scalar_type> * u.y - numbers::pi<scalar_type>, sinPhi, cosPhi);
         cache.direction = vector2_type(cosPhi, sinPhi);
-        return cache.direction * nbl::hlsl::sqrt(scalar_type(-2.0) * nbl::hlsl::log(u.x)) * stddev;
+        return cache.direction * nbl::hlsl::sqrt(_static_cast<scalar_type>(-2.0) * nbl::hlsl::log(u.x)) * stddev;
     }
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
diff --git a/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl b/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl
index c9e5cac5d6..d31c2d994a 100644
--- a/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl
@@ -41,7 +41,7 @@ struct ConcentricMapping
    static codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache)
    {
       // map [0,1]^2 to [-1,1]^2
-      const vector2_type centered = scalar_type(2) * u - hlsl::promote<vector2_type>(scalar_type(1));
+      const vector2_type centered = _static_cast<scalar_type>(2) * u - hlsl::promote<vector2_type>(_static_cast<scalar_type>(1));
 
       const scalar_type a = centered.x;
       const scalar_type b = centered.y;
@@ -51,10 +51,10 @@ struct ConcentricMapping
       const scalar_type dominant = hlsl::select(cond, a, b);
       const scalar_type minor = hlsl::select(cond, b, a);
 
-      const scalar_type safe_dominant = dominant != scalar_type(0) ? dominant : scalar_type(0);
+      const scalar_type safe_dominant = dominant != _static_cast<scalar_type>(0) ? dominant : _static_cast<scalar_type>(0);
       const scalar_type ratio = minor / safe_dominant;
 
-      const scalar_type angle = scalar_type(0.25) * numbers::pi<scalar_type> * ratio;
+      const scalar_type angle = _static_cast<scalar_type>(0.25) * numbers::pi<scalar_type> * ratio;
       const scalar_type c = hlsl::cos<scalar_type>(angle);
       const scalar_type s = hlsl::sin<scalar_type>(angle);
 
@@ -90,7 +90,7 @@ struct ConcentricMapping
       // angle in [0, pi/4]
       const scalar_type phi = hlsl::atan2(num, denom);
 
-      const scalar_type minor_val = r * phi / (scalar_type(0.25) * numbers::pi<scalar_type>);
+      const scalar_type minor_val = r * phi / (_static_cast<scalar_type>(0.25) * numbers::pi<scalar_type>);
 
       // reconstruct a,b using select instead of branching
       const scalar_type a_base = hlsl::select(swapped, minor_val, r);
@@ -99,7 +99,7 @@ struct ConcentricMapping
       const scalar_type a = ieee754::copySign(a_base, p.x);
       const scalar_type b = ieee754::copySign(b_base, p.y);
 
-      return (vector2_type(a, b) + hlsl::promote<vector2_type>(scalar_type(1))) * scalar_type(0.5);
+      return (vector2_type(a, b) + hlsl::promote<vector2_type>(_static_cast<scalar_type>(1))) * _static_cast<scalar_type>(0.5);
    }
 
    // The PDF of Shirley mapping is constant (1/PI on the unit disk)
diff --git a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
index 23e35e2f7d..52b063f448 100644
--- a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl
@@ -35,8 +35,8 @@ struct ProjectedHemisphere
 
 	static codomain_type __generate(const domain_type u)
 	{
-		vector_t2 p = ConcentricMapping<T>::generate(u * T(0.99999) + T(0.000005));
-		T z = hlsl::sqrt<T>(hlsl::max<T>(T(0.0), T(1.0) - p.x * p.x - p.y * p.y));
+		vector_t2 p = ConcentricMapping<T>::generate(u * _static_cast<T>(0.99999) + _static_cast<T>(0.000005));
+		T z = hlsl::sqrt<T>(hlsl::max<T>(_static_cast<T>(0.0), _static_cast<T>(1.0) - p.x * p.x - p.y * p.y));
 		return vector_t3(p.x, p.y, z);
 	}
 
@@ -93,11 +93,11 @@ struct ProjectedSphere
 	static codomain_type __generate(NBL_REF_ARG(domain_type) u)
 	{
 		vector_t3 retval = hemisphere_t::__generate(u.xy);
-		const bool chooseLower = u.z > T(0.5);
+		const bool chooseLower = u.z > _static_cast<T>(0.5); // upper half of u.z selects the mirrored hemisphere; cast through T like the lines below
 		retval.z = chooseLower ? (-retval.z) : retval.z;
 		if (chooseLower)
-			u.z -= T(0.5);
-		u.z *= T(2.0);
+			u.z -= _static_cast<T>(0.5); // shift the upper half back to [0, 0.5]
+		u.z *= _static_cast<T>(2.0); // rescale so u.z is written back stretched by 2
 		return retval;
 	}
 
@@ -110,7 +110,7 @@ struct ProjectedSphere
 
 	static density_type forwardPdf(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * cache.z * numbers::inv_pi<T>;
+		return _static_cast<T>(0.5) * cache.z * numbers::inv_pi<T>;
 	}
 
 	static weight_type forwardWeight(const domain_type u, const cache_type cache)
@@ -120,7 +120,7 @@ struct ProjectedSphere
 
 	static density_type backwardPdf(const codomain_type L)
 	{
-		return T(0.5) * hlsl::abs(L.z) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.5) * hlsl::abs(L.z) * numbers::inv_pi<T>;
 	}
 
 	static weight_type backwardWeight(const codomain_type L)
diff --git a/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
index 1f176f9713..d3657babb1 100644
--- a/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl
@@ -16,18 +16,38 @@ namespace hlsl
 namespace sampling
 {
 
-// Discrete sampler using cumulative probability lookup via upper_bound.
+// Discrete sampler using cumulative probability lookup.
 //
 // Samples a discrete index in [0, N) with probability proportional to
 // precomputed weights in O(log N) time per sample.
 //
-// The cumulative probability array stores N-1 entries (the last bucket
-// is always 1.0 and need not be stored). Entry i holds the sum of
-// probabilities for indices [0, i].
+// Three layouts / cache-population strategies, selected by the Mode parameter:
+//
+//   TRACKING (default):  N-1 CDF entries, last bucket implicit at 1.0.
+//                        A stateful comparator records the straddling CDF
+//                        values during upper_bound itself.
+//   YOLO:                Same storage. Plain upper_bound followed by two
+//                        re-reads of the adjacent CDF entries (warm cache).
+//                        Lower register footprint, two extra array reads.
+//   EYTZINGER:           Level-order implicit binary tree in 2*P entries
+//                        where P = roundUpPot(N). Leaves at [P, P+N) hold
+//                        the CDF; interior nodes at [1, P) hold split keys.
+//                        Descent reads adjacent memory at each step, so
+//                        every cache line pulled is fully utilised and the
+//                        first log2(subgroupSize) iterations are served by a
+//                        single transaction per subgroup. Build with
+//                        sampling::buildEytzinger<T>().
 //
 // Satisfies TractableSampler and ResamplableSampler (not BackwardTractableSampler:
 // the mapping is discrete).
-template<typename T, typename Domain, typename Codomain, typename CumProbAccessor
+enum CumulativeProbabilityMode : uint32_t
+{
+	TRACKING  = 0u,
+	YOLO      = 1u,
+	EYTZINGER = 2u
+};
+
+template<typename T, typename Domain, typename Codomain, typename CumProbAccessor, CumulativeProbabilityMode Mode = CumulativeProbabilityMode::TRACKING
 	NBL_PRIMARY_REQUIRES(concepts::accessors::GenericReadAccessor<CumProbAccessor, T, Codomain>)
 struct CumulativeProbabilitySampler
 {
@@ -44,58 +64,116 @@ struct CumulativeProbabilitySampler
 		density_type upperBound;
 	};
 
+	// `_size` is the user-facing bucket count N for every mode. TRACKING / YOLO
+	// expect the accessor to hold N-1 CDF entries; EYTZINGER expects 2*P entries
+	// in the level-order layout produced by buildEytzinger.
 	static CumulativeProbabilitySampler create(NBL_CONST_REF_ARG(CumProbAccessor) _cumProbAccessor, uint32_t _size)
 	{
 		CumulativeProbabilitySampler retval;
 		retval.cumProbAccessor = _cumProbAccessor;
 		retval.storedCount = _size - 1u;
+		retval.depth = 0u;
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
+		{
+			uint32_t P = 1u;
+			uint32_t d = 0u;
+			while (P < _size) { P <<= 1u; ++d; }
+			retval.depth = d;
+		}
 		return retval;
 	}
 
 	// BasicSampler interface
 	codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC
 	{
-		// upper_bound returns first index where cumProb > u
-		return hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
+		{
+			const uint32_t leafBase = 1u << depth;
+			uint32_t index = 1u;
+			for (uint32_t iter = 0u; iter < depth; ++iter)
+			{
+				density_type key;
+				cumProbAccessor.template get<density_type, uint32_t>(index, key);
+				index = (index << 1u) | uint32_t(!(u < key));
+			}
+			const codomain_type result = codomain_type(index - leafBase);
+			return result < codomain_type(storedCount) ? result : codomain_type(storedCount);
+		}
+		else
+		{
+			return hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
+		}
 	}
 
 	// TractableSampler interface
 	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
 	{
-		// #define NBL_CUMPROB_YOLO_READS
-#ifdef NBL_CUMPROB_YOLO_READS
-		// YOLO approach: re-read the array after binary search.
-		// The accessed elements are adjacent to the found index so the cache is warm.
-		const codomain_type result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
-		cache.oneBefore = density_type(0.0);
-		if (result)
-			cumProbAccessor.template get<density_type, codomain_type>(result - 1u, cache.oneBefore);
-		cache.upperBound = density_type(1.0);
-		if (result < storedCount)
-			cumProbAccessor.template get<density_type, codomain_type>(result, cache.upperBound);
-#else
-		// Tracking reads approach: stateful comparator captures CDF values during binary search.
-		struct CdfComparator
+		codomain_type result;
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
 		{
-			bool operator()(const density_type value, const density_type rhs)
+			// Descent visits one interior node per level. Going left tightens
+			// the upper bound to the current key; going right tightens the
+			// lower bound. Final (index - leafBase), clamped to storedCount, is the bucket.
+			cache.oneBefore = _static_cast<density_type>(0.0);
+			cache.upperBound = _static_cast<density_type>(1.0);
+			const uint32_t leafBase = 1u << depth;
+			uint32_t index = 1u;
+			for (uint32_t iter = 0u; iter < depth; ++iter)
 			{
-				const bool retval = value < rhs;
-				if (retval)
-					upperBound = rhs;
+				density_type key;
+				cumProbAccessor.template get<density_type, uint32_t>(index, key);
+				const bool goRight = !(u < key);
+				if (goRight)
+				{
+					cache.oneBefore = key;
+					index = (index << 1u) | 1u;
+				}
 				else
-					oneBefore = rhs;
-				return retval;
+				{
+					cache.upperBound = key;
+					index = (index << 1u);
+				}
 			}
-
-			density_type oneBefore;
-			density_type upperBound;
-		} comp;
-		comp.oneBefore = density_type(0.0);
-		comp.upperBound = density_type(1.0);
-		const codomain_type result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u, comp);
-		cache.oneBefore = comp.oneBefore;
-		cache.upperBound = comp.upperBound;
-#endif
+			const codomain_type raw = codomain_type(index - leafBase);
+			result = raw < codomain_type(storedCount) ? raw : codomain_type(storedCount);
+		}
+		else NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::YOLO)
+		{
+			// Re-read the two adjacent CDF entries after the binary search.
+			// Both sit on the cache lines the search just touched, so they are warm.
+			result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u);
+			cache.oneBefore = _static_cast<density_type>(0.0);
+			if (result)
+				cumProbAccessor.template get<density_type, codomain_type>(result - 1u, cache.oneBefore);
+			cache.upperBound = _static_cast<density_type>(1.0);
+			if (result < storedCount)
+				cumProbAccessor.template get<density_type, codomain_type>(result, cache.upperBound);
+		}
+		else
+		{
+			// TRACKING: stateful comparator captures the CDF values straddling the
+			// found index during the binary search itself, avoiding the two extra reads.
+			struct CdfComparator
+			{
+				bool operator()(const density_type value, const density_type rhs)
+				{
+					const bool retval = value < rhs;
+					if (retval)
+						upperBound = rhs;
+					else
+						oneBefore = rhs;
+					return retval;
+				}
+
+				density_type oneBefore;
+				density_type upperBound;
+			} comp;
+			comp.oneBefore = _static_cast<density_type>(0.0);
+			comp.upperBound = _static_cast<density_type>(1.0);
+			result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u, comp);
+			cache.oneBefore = comp.oneBefore;
+			cache.upperBound = comp.upperBound;
+		}
 		return result;
 	}
 
@@ -111,16 +189,34 @@ struct CumulativeProbabilitySampler
 
 	density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC
 	{
-		density_type retval = density_type(1.0);
-		if (v < storedCount)
-			cumProbAccessor.template get<density_type, codomain_type>(v, retval);
-		if (v)
+		NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER)
 		{
-			density_type prev;
-			cumProbAccessor.template get<density_type, codomain_type>(v - 1u, prev);
-			retval -= prev;
+			// Leaves store the CDF directly; the last real leaf is normalized
+			// to 1.0 and padded leaves (if any) also hold 1.0.
+			const uint32_t leafBase = 1u << depth;
+			density_type retval;
+			cumProbAccessor.template get<density_type, uint32_t>(leafBase + uint32_t(v), retval);
+			if (v)
+			{
+				density_type prev;
+				cumProbAccessor.template get<density_type, uint32_t>(leafBase + uint32_t(v) - 1u, prev);
+				retval -= prev;
+			}
+			return retval;
+		}
+		else
+		{
+			density_type retval = _static_cast<density_type>(1.0);
+			if (v < storedCount)
+				cumProbAccessor.template get<density_type, codomain_type>(v, retval);
+			if (v)
+			{
+				density_type prev;
+				cumProbAccessor.template get<density_type, codomain_type>(v - 1u, prev);
+				retval -= prev;
+			}
+			return retval;
 		}
-		return retval;
 	}
 
 	weight_type backwardWeight(const codomain_type v) NBL_CONST_MEMBER_FUNC
@@ -129,7 +225,8 @@ struct CumulativeProbabilitySampler
 	}
 
 	CumProbAccessor cumProbAccessor;
-	uint32_t storedCount;
+	uint32_t storedCount;    // N - 1 (last real bucket index)
+	uint32_t depth;          // EYTZINGER only: ceil(log2(N)), iteration count; leafBase = 1 << depth
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h b/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h
index a511fc2d8c..bf98d5ec93 100644
--- a/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h
+++ b/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h
@@ -30,6 +30,61 @@ void computeNormalizedCumulativeHistogram(std::span<const T> weights, T* outCumP
 	std::for_each(outCumProb, outCumProb + N - 1, [normalizationFactor](T& v) { v *= normalizationFactor; });
 }
 
+// Returns the smallest power of two >= N (and 1 for N <= 1). Matches the leaf-count
+// the Eytzinger builder pads to.
+inline uint32_t eytzingerLeafCount(uint32_t N)
+{
+	uint32_t P = 1u;
+	while (P < N) P <<= 1u;
+	return P;
+}
+
+// Builds an Eytzinger-layout CDF for cache-friendly binary search on the GPU.
+//
+// Layout (1-indexed, size 2*P where P = eytzingerLeafCount(N)):
+//   tree[0]           unused (keeps parent/child arithmetic branch-free)
+//   tree[1 .. P-1]    interior split keys; tree[v] == rightmost leaf of v's left subtree
+//   tree[P .. P+N-1]  leaves, tree[P + i] = normalized inclusive scan of weights up to i
+//   tree[P+N .. 2P-1] padded leaves, all 1.0 (any u < 1.0 routes away from these)
+//
+// The sampler walks the tree as index = (index << 1) | goRight for ceil(log2(N))
+// iterations. Successive taps within one search land on adjacent memory, so every
+// cache line pulled is fully used and the first log2(subgroupSize) iterations are
+// served by a single memory transaction per subgroup.
+template<typename T>
+void buildEytzinger(std::span<const T> weights, T* outTree)
+{
+	const uint32_t N = static_cast<uint32_t>(weights.size());
+	if (N == 0)
+		return;
+
+	const uint32_t P = eytzingerLeafCount(N);
+
+	T total = T(0);
+	for (uint32_t i = 0; i < N; ++i)
+		total += weights[i];
+	const T rcpTotal = T(1) / total;
+
+	T acc = T(0);
+	for (uint32_t i = 0; i < N; ++i)
+	{
+		acc += weights[i];
+		outTree[P + i] = acc * rcpTotal;
+	}
+	for (uint32_t i = N; i < P; ++i)
+		outTree[P + i] = T(1);
+
+	// Bottom-up: each interior node copies the rightmost leaf of its left subtree,
+	// found by descending left-then-always-right from v.
+	for (uint32_t v = P; v-- > 1u;)
+	{
+		uint32_t r = v << 1u;
+		while (r < P)
+			r = (r << 1u) | 1u;
+		outTree[v] = outTree[r];
+	}
+}
+
 } // namespace sampling
 } // namespace hlsl
 } // namespace nbl
diff --git a/include/nbl/builtin/hlsl/sampling/linear.hlsl b/include/nbl/builtin/hlsl/sampling/linear.hlsl
index 12602b4a79..cd260cea3a 100644
--- a/include/nbl/builtin/hlsl/sampling/linear.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/linear.hlsl
@@ -39,14 +39,14 @@ struct Linear
         // add min to both coefficients so (0,0) input produces a valid uniform sampler
         // instead of inf normalization (2/0) leading to NaN; negligible for normal inputs
         const vector2_type safeCoeffs = linearCoeffs + vector2_type(hlsl::numeric_limits<scalar_type>::min, hlsl::numeric_limits<scalar_type>::min);
-        // normalize coefficients so that the PDF is simply linearCoeffStart + linearCoeffDiff * x
-        const scalar_type normFactor = scalar_type(2.0) / (safeCoeffs[0] + safeCoeffs[1]);
+        // normalize coefficients so that the PDF is simply normalizedCoeffStart + linearCoeffDiff * x
+        const scalar_type normFactor = _static_cast<scalar_type>(2.0) / (safeCoeffs[0] + safeCoeffs[1]);
         const vector2_type normalized = safeCoeffs * normFactor;
-        retval.linearCoeffStart = normalized[0];
-        retval.linearCoeffEnd = normalized[1];
+        retval.normalizedCoeffStart = normalized[0];
+        retval.normalizedCoeffEnd = normalized[1];
         // precompute for the stable quadratic in generate()
         retval.squaredCoeffStart = normalized[0] * normalized[0];
-        retval.twoTimesDiff = scalar_type(2.0) * (normalized[1] - normalized[0]);
+        retval.twoTimesDiff = _static_cast<scalar_type>(2.0) * (normalized[1] - normalized[0]);
         return retval;
     }
 
@@ -57,18 +57,18 @@ struct Linear
         // Quadratic (1-start)*x^2 + start*x - u = 0; since start >= 0 the stable root is
         // x = 2u / (start + sqrt(start^2 + 2*diff*u)), which never cancels.
         const scalar_type sqrtTerm = sqrt(squaredCoeffStart + twoTimesDiff * u);
-        const scalar_type denom = linearCoeffStart + sqrtTerm;
+        const scalar_type denom = normalizedCoeffStart + sqrtTerm;
         // NOTE: floating point can make x slightly > 1 when u~1 and diff < 0; callers needing
         // non-negative PDF at the boundary should clamp with min(x, 1).
         const codomain_type x = (u + u) / denom;
         // diff*x == sqrtTerm - start algebraically (conjugate identity), saves 1 mul
-        cache.diffTimesX = sqrtTerm - linearCoeffStart;
+        cache.diffTimesX = sqrtTerm - normalizedCoeffStart;
         return x;
     }
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        return linearCoeffStart + cache.diffTimesX;
+        return normalizedCoeffStart + cache.diffTimesX;
     }
 
     weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
@@ -82,8 +82,8 @@ struct Linear
     // Not used because we already store start for generate().
     density_type backwardPdf(const codomain_type x) NBL_CONST_MEMBER_FUNC
     {
-        assert(x >= scalar_type(0.0) && x <= scalar_type(1.0));
-        return hlsl::mix(linearCoeffStart, linearCoeffEnd, x);
+        assert(x >= _static_cast<scalar_type>(0.0) && x <= _static_cast<scalar_type>(1.0));
+        return hlsl::mix(normalizedCoeffStart, normalizedCoeffEnd, x);
     }
 
     weight_type backwardWeight(const codomain_type x) NBL_CONST_MEMBER_FUNC
@@ -91,8 +91,8 @@ struct Linear
         return backwardPdf(x);
     }
 
-    scalar_type linearCoeffStart;
-    scalar_type linearCoeffEnd;
+    scalar_type normalizedCoeffStart;
+    scalar_type normalizedCoeffEnd;
     scalar_type squaredCoeffStart;
     scalar_type twoTimesDiff;
 };
diff --git a/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl b/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl
index 719eaf504f..64d5e69b96 100644
--- a/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl
@@ -32,7 +32,7 @@ struct PolarMapping
 	static codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache)
 	{
 		const scalar_type r = hlsl::sqrt<scalar_type>(u.x);
-		const scalar_type phi = scalar_type(2) * numbers::pi<scalar_type> * u.y;
+		const scalar_type phi = _static_cast<scalar_type>(2) * numbers::pi<scalar_type> * u.y;
 		return vector2_type(r * hlsl::cos<scalar_type>(phi), r * hlsl::sin<scalar_type>(phi));
 	}
 
@@ -46,8 +46,8 @@ struct PolarMapping
 	{
 		const scalar_type r2 = p.x * p.x + p.y * p.y;
 		scalar_type phi = hlsl::atan2(p.y, p.x);
-		phi += hlsl::mix(scalar_type(0), scalar_type(2) * numbers::pi<scalar_type>, phi < scalar_type(0));
-		return vector2_type(r2, phi * (scalar_type(0.5) * numbers::inv_pi<scalar_type>));
+		phi += hlsl::mix(_static_cast<scalar_type>(0), _static_cast<scalar_type>(2) * numbers::pi<scalar_type>, phi < _static_cast<scalar_type>(0));
+		return vector2_type(r2, phi * (_static_cast<scalar_type>(0.5) * numbers::inv_pi<scalar_type>));
 	}
 
 	static density_type forwardPdf(const domain_type u, cache_type cache) { return numbers::inv_pi<scalar_type>; }
diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
index 5bf652cb4c..1ad8b5462e 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl
@@ -25,147 +25,166 @@ namespace sampling
 //   2. Warp uniform [0,1]^2 through the bilinear to importance-sample NdotL
 //   3. Feed the warped UV into the solid angle sampler to get a rect offset
 //   4. PDF = (1/SolidAngle) * bilinearPdf
-//
-// Template parameter `UsePdfAsWeight`: when true (default), forwardWeight/backwardWeight
-// return the PDF instead of the projected-solid-angle MIS weight.
-// TODO: the projected-solid-angle MIS weight (UsePdfAsWeight=false) has been shown to be
-// poor in practice. Once confirmed by testing, remove the false path and stop storing
-// receiverNormal, receiverWasBSDF, and rcpProjSolidAngle as members.
 template<typename T, bool UsePdfAsWeight = true>
 struct ProjectedSphericalRectangle
 {
-	using scalar_type = T;
-	using vector2_type = vector<T, 2>;
-	using vector3_type = vector<T, 3>;
-	using vector4_type = vector<T, 4>;
-
-	// BackwardTractableSampler concept types
-	using domain_type = vector2_type;
-	using codomain_type = vector3_type;
-	using density_type = scalar_type;
-	using weight_type = density_type;
-
-	struct cache_type
-	{
-		scalar_type abs_cos_theta;
-		vector2_type warped;
-		typename Bilinear<scalar_type>::cache_type bilinearCache;
-	};
-
-	// NOTE: produces a degenerate (all-zero) bilinear patch when the receiver normal faces away
-	// from all four rectangle vertices, resulting in NaN PDFs (0 * inf). Callers must ensure
-	// at least one vertex has positive projection onto the receiver normal.
-	static ProjectedSphericalRectangle<T,UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
-	{
-		ProjectedSphericalRectangle<T,UsePdfAsWeight> retval;
-		const vector3_type n = hlsl::mul(shape.basis, _receiverNormal);
-		retval.localReceiverNormal = n;
-		retval.receiverWasBSDF = _receiverWasBSDF;
-
-		// Compute solid angle and get r0 in local frame (before z-flip)
-		const typename shapes::SphericalRectangle<T>::solid_angle_type sa = shape.solidAngle(observer);
-		const vector3_type r0 = sa.r0;
-
-		// All 4 corners share r0.z; x is r0.x or r0.x+ex, y is r0.y or r0.y+ey
-		const scalar_type r1x = r0.x + shape.extents.x;
-		const scalar_type r1y = r0.y + shape.extents.y;
-
-		// Unnormalized dots: dot(corner_i, n)
-		const scalar_type base_dot = hlsl::dot(r0, n);
-		const scalar_type dx = shape.extents.x * n.x;
-		const scalar_type dy = shape.extents.y * n.y;
-		const vector4_type dots = vector4_type(base_dot, base_dot + dx, base_dot + dy, base_dot + dx + dy);
-
-		// Squared lengths of each corner
-		const scalar_type r0zSq = r0.z * r0.z;
-		const vector4_type lenSq = vector4_type(
-			r0.x * r0.x + r0.y * r0.y,
-			r1x * r1x + r0.y * r0.y,
-			r0.x * r0.x + r1y * r1y,
-			r1x * r1x + r1y * r1y
-		) + hlsl::promote<vector4_type>(r0zSq);
-
-		// dot(normalize(corner), n) = dot(corner, n) * rsqrt(lenSq)
-		// Bilinear corners: [0]=v00 [1]=v10 [2]=v01 [3]=v11
-		const scalar_type minimumProjSolidAngle = 0.0;
-		const vector4_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF,
-			dots * hlsl::rsqrt<vector4_type>(lenSq),
-			hlsl::promote<vector4_type>(minimumProjSolidAngle));
-		retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex);
-
-		// Reuse the already-computed solid_angle_type to avoid recomputing mul(basis, origin - observer)
-		retval.sphrect = SphericalRectangle<T>::create(sa, shape.extents);
-		retval.rcpSolidAngle = retval.sphrect.solidAngle > scalar_type(0.0) ? scalar_type(1.0) / retval.sphrect.solidAngle : scalar_type(0.0);
-
-		NBL_IF_CONSTEXPR(!UsePdfAsWeight)
-		{
-			const scalar_type projSA = shape.projectedSolidAngleFromLocal(r0, n);
-			retval.rcpProjSolidAngle = projSA > scalar_type(0.0) ? scalar_type(1.0) / projSA : scalar_type(0.0);
-		}
-		return retval;
-	}
-
-	// returns a normalized 3D direction in the local frame
-	codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-	{
-		Bilinear<scalar_type> bilinear = bilinearPatch;
-		cache.warped = bilinear.generate(u, cache.bilinearCache);
-		typename SphericalRectangle<scalar_type>::cache_type sphrectCache;
-		const vector3_type dir = sphrect.generate(cache.warped, sphrectCache);
-		cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache);
-		return dir;
-	}
-
-	// returns a 2D offset on the rectangle surface from the r0 corner
-	vector2_type generateSurfaceOffset(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-	{
-		Bilinear<scalar_type> bilinear = bilinearPatch;
-		cache.warped = bilinear.generate(u, cache.bilinearCache);
-		typename SphericalRectangle<scalar_type>::cache_type sphrectCache;
-		const vector2_type sampleOffset = sphrect.generateSurfaceOffset(cache.warped, sphrectCache);
-		cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache);
-		return sampleOffset;
-	}
-
-	density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-	{
-		return rcpSolidAngle * bilinearPatch.forwardPdf(u, cache.bilinearCache);
-	}
-
-	weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-	{
-		if (UsePdfAsWeight)
-			return forwardPdf(u, cache);
-		return cache.abs_cos_theta * rcpProjSolidAngle;
-	}
-
-	// `p` is the normalized [0,1]^2 position on the rectangle
-	density_type backwardPdf(const vector2_type p) NBL_CONST_MEMBER_FUNC
-	{
-		return rcpSolidAngle * bilinearPatch.backwardPdf(p);
-	}
-
-	weight_type backwardWeight(const vector2_type p) NBL_CONST_MEMBER_FUNC
-	{
-		NBL_IF_CONSTEXPR(UsePdfAsWeight)
-			return backwardPdf(p);
-		const scalar_type minimumProjSolidAngle = 0.0;
-		// Reconstruct local direction from normalized rect position
-		const vector3_type localDir = hlsl::normalize(sphrect.r0 + vector3_type(
-			p.x * sphrect.extents.x,
-			p.y * sphrect.extents.y,
-			scalar_type(0)
-		));
-		const scalar_type abs_cos_theta = math::conditionalAbsOrMax(receiverWasBSDF, hlsl::dot(localReceiverNormal, localDir), minimumProjSolidAngle);
-		return abs_cos_theta * rcpProjSolidAngle;
-	}
-
-	sampling::SphericalRectangle<T> sphrect;
-	Bilinear<scalar_type> bilinearPatch;
-	scalar_type rcpSolidAngle;
-	scalar_type rcpProjSolidAngle;
-	vector3_type localReceiverNormal;
-	bool receiverWasBSDF;
+   using scalar_type  = T;
+   using vector2_type = vector<T, 2>;
+   using vector3_type = vector<T, 3>;
+   using vector4_type = vector<T, 4>;
+
+   // BackwardTractableSampler concept types
+   using domain_type   = vector2_type;
+   using codomain_type = vector3_type;
+   using density_type  = scalar_type;
+   using weight_type   = density_type;
+
+   struct cache_type
+   {
+      typename Bilinear<scalar_type>::cache_type bilinearCache;
+      vector3_type L; // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
+   };
+
+   // Shared finalization for both create() overloads: builds the bilinear patch, the inner sphrect
+   // sampler, and the UsePdfAsWeight=false extras. The two overloads differ only in how they
+   // compute bxdfPdfAtVertex (worldspace corner normalizations vs local-frame rsqrt(lenSq)).
+   static ProjectedSphericalRectangle<T, UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa,
+      const vector4_type bxdfPdfAtVertex, const vector3_type _receiverNormal)
+   {
+      ProjectedSphericalRectangle<T, UsePdfAsWeight> retval;
+      retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex);
+      // Reuse solid_angle_type to avoid recomputing mul(basis, origin - observer)
+      retval.sphrect = SphericalRectangle<T>::create(shape.basis, sa, shape.extents);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+      {
+         retval.receiverNormal     = _receiverNormal;
+         const vector3_type nLocal = hlsl::mul(shape.basis, _receiverNormal);
+         retval.projSolidAngle     = shape.projectedSolidAngleFromLocal(sa.r0, nLocal);
+      }
+      return retval;
+   }
+
+   // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
+   static ProjectedSphericalRectangle<T, UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::CompressedSphericalRectangle<T>) compressed, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
+   {
+      // 4 normalized worldspace corners dotted with the worldspace receiver normal. Avoids the
+      // mul(basis, receiverNormal) data dependency of the uncompressed overload so these 4
+      // normalize+dot chains can pipeline alongside the basis/solid-angle work below.
+      const vector3_type c0   = compressed.origin - observer;
+      const vector3_type c1   = c0 + compressed.right;
+      const vector3_type c2   = c0 + compressed.up;
+      const vector3_type c3   = c1 + compressed.up;
+      const vector4_type dots = vector4_type(
+         hlsl::dot(hlsl::normalize(c0), _receiverNormal),
+         hlsl::dot(hlsl::normalize(c1), _receiverNormal),
+         hlsl::dot(hlsl::normalize(c2), _receiverNormal),
+         hlsl::dot(hlsl::normalize(c3), _receiverNormal));
+      const scalar_type minimumProjSolidAngle = _static_cast<scalar_type>(0.0);
+      const vector4_type bxdfPdfAtVertex      = math::conditionalAbsOrMax(_receiverWasBSDF, dots, hlsl::promote<vector4_type>(minimumProjSolidAngle));
+
+      const shapes::SphericalRectangle<T> shape                         = shapes::SphericalRectangle<T>::create(compressed);
+      const typename shapes::SphericalRectangle<T>::solid_angle_type sa = shape.solidAngle(observer);
+      return create(shape, sa, bxdfPdfAtVertex, _receiverNormal);
+   }
+
+   // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler
+   static ProjectedSphericalRectangle<T, UsePdfAsWeight> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
+   {
+      // Local-frame path: unnormalized dot(corner_i, n) with n = basis * receiverNormal, then
+      // a single rsqrt<vec4>(lenSq) for all 4 corner normalizations at once.
+      const vector3_type n                                              = hlsl::mul(shape.basis, _receiverNormal);
+      const typename shapes::SphericalRectangle<T>::solid_angle_type sa = shape.solidAngle(observer);
+      const vector3_type r0                                             = sa.r0;
+
+      // All 4 corners share r0.z; x is r0.x or r0.x+ex, y is r0.y or r0.y+ey
+      const scalar_type r1x      = r0.x + shape.extents.x;
+      const scalar_type r1y      = r0.y + shape.extents.y;
+      const scalar_type base_dot = hlsl::dot(r0, n);
+      const scalar_type dx       = shape.extents.x * n.x;
+      const scalar_type dy       = shape.extents.y * n.y;
+      const vector4_type dots    = vector4_type(base_dot, base_dot + dx, base_dot + dy, base_dot + dx + dy);
+
+      const scalar_type r0zSq  = r0.z * r0.z;
+      const vector4_type lenSq = vector4_type(
+                                    r0.x * r0.x + r0.y * r0.y,
+                                    r1x * r1x + r0.y * r0.y,
+                                    r0.x * r0.x + r1y * r1y,
+                                    r1x * r1x + r1y * r1y) +
+         hlsl::promote<vector4_type>(r0zSq);
+
+      // dot(normalize(corner), n) = dot(corner, n) * rsqrt(lenSq). Bilinear corners: [0]=v00 [1]=v10 [2]=v01 [3]=v11
+      const scalar_type minimumProjSolidAngle = _static_cast<scalar_type>(0.0);
+      const vector4_type bxdfPdfAtVertex      = math::conditionalAbsOrMax(_receiverWasBSDF,
+         dots * hlsl::rsqrt<vector4_type>(lenSq),
+         hlsl::promote<vector4_type>(minimumProjSolidAngle));
+
+      return create(shape, sa, bxdfPdfAtVertex, _receiverNormal);
+   }
+
+   // returns a normalized 3D direction in the local frame
+   codomain_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+      typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+      const vector3_type dir = sphrect.generateNormalizedLocal(warped, sphrectCache, hitDist);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+         cache.L = hlsl::mul(hlsl::transpose(sphrect.basis), dir);
+      return dir;
+   }
+
+   // returns an unnormalized 3D direction in the global frame
+   codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+      typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+      const vector3_type dir = sphrect.generateUnnormalized(warped, sphrectCache);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+         cache.L = dir * hlsl::rsqrt(hlsl::dot(dir, dir));
+      return dir;
+   }
+
+   // returns a normalized 3D direction in the global frame
+   codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+      typename SphericalRectangle<scalar_type>::cache_type sphrectCache; // there's nothing in the cache
+      const vector3_type dir = sphrect.generate(warped, sphrectCache);
+      NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+      cache.L = dir;
+      return dir;
+   }
+
+   density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return bilinearPatch.forwardPdf(u, cache.bilinearCache) / sphrect.solidAngle;
+   }
+
+   weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      NBL_IF_CONSTEXPR(UsePdfAsWeight)
+         return forwardPdf(u, cache);
+      return backwardWeight(cache.L);
+   }
+
+   weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      NBL_IF_CONSTEXPR(UsePdfAsWeight)
+      {
+#if 0
+			const vector2_type warped = sphrect.generateInvese(L); // TODO: implement `generateInverse`
+			return bilinearPatch.backwardPdf(warped) / sphrect.solidAngle;
+#endif
+         return hlsl::numeric_limits<weight_type>::quiet_NaN;
+      }
+      // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
+      return hlsl::abs(hlsl::dot(L, receiverNormal)) / projSolidAngle;
+   }
+
+   sampling::SphericalRectangle<T> sphrect;
+   Bilinear<scalar_type> bilinearPatch;
+   // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false
+   vector3_type receiverNormal;
+   scalar_type projSolidAngle;
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
index c4bc5fcea8..6eddd03e8a 100644
--- a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl
@@ -47,73 +47,68 @@ struct ProjectedSphericalTriangle
 
     struct cache_type
     {
-        scalar_type abs_cos_theta;
-        vector2_type warped;
         typename Bilinear<scalar_type>::cache_type bilinearCache;
+        vector3_type L; // TODO: only used when UsePdfAsWeight==false, erase when UsePdfAsWeight==true
     };
 
-    // NOTE: produces a degenerate (all-zero) bilinear patch when the receiver normal faces away
-    // from all three triangle vertices, resulting in NaN PDFs (0 * inf). Callers must ensure
-    // at least one vertex has positive projection onto the receiver normal.
+    // Shouldn't produce NaN even if all corners have 0 projected solid angle, thanks to the min density adds/clamps in the linear sampler
     static ProjectedSphericalTriangle<T,UsePdfAsWeight> create(NBL_REF_ARG(shapes::SphericalTriangle<T>) shape, const vector3_type _receiverNormal, const bool _receiverWasBSDF)
     {
         ProjectedSphericalTriangle<T,UsePdfAsWeight> retval;
-       retval.sphtri = SphericalTriangle<T, UsePdfAsWeight>::create(shape);
-        retval.receiverNormal = _receiverNormal;
-        retval.receiverWasBSDF = _receiverWasBSDF;
+        retval.sphtri = SphericalTriangle<T>::create(shape);
 
         const scalar_type minimumProjSolidAngle = 0.0;
         matrix<T, 3, 3> m = matrix<T, 3, 3>(shape.vertices[0], shape.vertices[1], shape.vertices[2]);
         const vector3_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF, hlsl::mul(m, _receiverNormal), hlsl::promote<vector3_type>(minimumProjSolidAngle));
         retval.bilinearPatch = Bilinear<scalar_type>::create(bxdfPdfAtVertex.yyxz);
 
-        const scalar_type projSA = shape.projectedSolidAngle(_receiverNormal);
-        retval.rcpProjSolidAngle = projSA > scalar_type(0.0) ? scalar_type(1.0) / projSA : scalar_type(0.0);
+        NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+        {
+            retval.receiverNormal = _receiverNormal;
+            // prevent division of 0 cosine by 0
+            retval.projSolidAngle = max<scalar_type>(shape.projectedSolidAngle(_receiverNormal),numeric_limits<scalar_type>::min);
+        }
         return retval;
     }
 
     codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
     {
-        Bilinear<scalar_type> bilinear = bilinearPatch;
-        cache.warped = bilinear.generate(u, cache.bilinearCache);
-        typename SphericalTriangle<scalar_type>::cache_type sphtriCache;
-        const vector3_type L = sphtri.generate(cache.warped, sphtriCache);
-        cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache);
+        const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache);
+        typename SphericalTriangle<scalar_type>::cache_type sphtriCache; // PDF is constant so it caches nothing, it's empty
+        const codomain_type L = sphtri.generate(warped, sphtriCache);
+        NBL_IF_CONSTEXPR(!UsePdfAsWeight)
+            cache.L = L;
         return L;
     }
 
     density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u, cache.bilinearCache);
+        return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u,cache.bilinearCache);
     }
 
     weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
     {
-        if (UsePdfAsWeight)
-            return forwardPdf(u, cache);
-        return cache.abs_cos_theta * rcpProjSolidAngle;
-    }
-
-    density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
-    {
-        const vector2_type u = sphtri.generateInverse(L);
-        return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u);
+        NBL_IF_CONSTEXPR (UsePdfAsWeight)
+            return forwardPdf(u,cache);
+        return backwardWeight(cache.L);
     }
 
     weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
     {
-        NBL_IF_CONSTEXPR(UsePdfAsWeight)
-            return backwardPdf(L);
-        const scalar_type minimumProjSolidAngle = 0.0;
-        const scalar_type abs_cos_theta = math::conditionalAbsOrMax(receiverWasBSDF, hlsl::dot(receiverNormal, L), minimumProjSolidAngle);
-        return abs_cos_theta * rcpProjSolidAngle;
+        NBL_IF_CONSTEXPR (UsePdfAsWeight)
+        {
+            const vector2_type u = sphtri.generateInverse(L);
+            return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u);
+        }
+        // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used
+        return hlsl::abs(hlsl::dot(L,receiverNormal))/projSolidAngle;
     }
 
-    sampling::SphericalTriangle<T, UsePdfAsWeight> sphtri;
+    sampling::SphericalTriangle<T> sphtri;
     Bilinear<scalar_type> bilinearPatch;
-    scalar_type rcpProjSolidAngle;
+    // TODO: only used when UsePdfAsWeight==false, erase when UsePdfAsWeight==true
     vector3_type receiverNormal;
-    bool receiverWasBSDF;
+    scalar_type projSolidAngle;
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_pyramid.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_pyramid.hlsl
new file mode 100644
index 0000000000..a44b8ff73d
--- /dev/null
+++ b/include/nbl/builtin/hlsl/sampling/spherical_pyramid.hlsl
@@ -0,0 +1,398 @@
+// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_SAMPLING_SPHERICAL_PYRAMID_HLSL_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SAMPLING_SPHERICAL_PYRAMID_HLSL_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+#include <nbl/builtin/hlsl/shapes/obb.hlsl>
+#include <nbl/builtin/hlsl/shapes/obb_silhouette.hlsl>
+#include <nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl>
+#include <nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl>
+#include <nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace sampling
+{
+
+// Tag-dispatched inner sampler factory: overload selected by the type of the
+// default-constructed `tag` arg. Avoids the per-inner adapter struct.
+inline SphericalRectangle<float32_t> buildInner(float32_t3x3 basis, float32_t2 r0, float32_t2 ext, SphericalRectangle<float32_t> /*tag*/)
+{
+   return SphericalRectangle<float32_t>::create(basis, float32_t3(r0, 1.0f), ext); // corner is r0 lifted to z = 1; selects the create(_basis, _r0, _extents) overload
+}
+
+inline ProjectedSphericalRectangle<float32_t> buildInner(float32_t3x3 basis, float32_t2 r0, float32_t2 ext, ProjectedSphericalRectangle<float32_t> /*tag*/)
+{
+   shapes::CompressedSphericalRectangle<float32_t> compressed;
+   compressed.origin = basis[0] * r0.x + basis[1] * r0.y + basis[2]; // corner: r0.x * basis[0] + r0.y * basis[1] + 1 * basis[2]
+   compressed.right  = basis[0] * ext.x; // edge along basis[0], scaled by the x extent
+   compressed.up     = basis[1] * ext.y; // edge along basis[1], scaled by the y extent
+   return ProjectedSphericalRectangle<float32_t>::create(compressed, float32_t3(0.0f, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, 1.0f), false); // NOTE(review): presumably (shape, observer at origin, receiver normal +Z, receiverWasBSDF) - confirm against ProjectedSphericalRectangle::create
+}
+
+// Spherical Pyramid: gnomonic bounding rectangle for silhouette sampling.
+//
+// UseCaliper=false: axis1 picks the longest world-space silhouette edge
+//   (one compare per edge, no inner loop, blind to perpendicular spread).
+// UseCaliper=true: spherical rotating-caliper. For each candidate edge (A, B),
+//   the extremal opposing vertex C is found via argmax_K dot(C_K, precross)
+//   where precross = cross(B-A, n0); this matches argmax dot(n0, cross(C+A, C+B))
+//   by the cyclic scalar triple product. Score = cos(dihedral) between the
+//   AB-great-circle and the Lexell-circle plane through (-A, -B, C). The
+//   lune cosine is a heuristic; the post-search bound pass is exact regardless.
+//
+// Pipeline: axis3 = normalize(-unnormCentroid); axis1 = project bestEdge3d
+// onto plane(axis3); axis2 = cross(axis3, axis1); computeBound3D yields
+// (rectR0, rectExtents). axis3 is not stored, reconstructed via getAxis3().
+//
+// rectR0/rectExtents are returned out-params from createFromVertices and not
+// stored on the pyramid (the inner sampler keeps its own copy). The local
+// vertex array dies at end-of-create-scope; only the inner sampler persists.
+template<bool UseCaliper, typename InnerSampler>
+struct SphericalPyramid
+{
+   using scalar_type   = float32_t;
+   using vector2_type  = float32_t2;
+   using vector3_type  = float32_t3;
+   using domain_type   = vector2_type;
+   using codomain_type = vector3_type;
+   using density_type  = scalar_type;
+   using weight_type   = density_type;
+
+   // Caches the inner sampler's cache plus a pre-computed `pdf` that bakes in
+   // the silhouette/horizon validity test from generate().
+   struct cache_type
+   {
+      typename InnerSampler::cache_type inner;
+      density_type                      pdf;
+   };
+
+   float32_t3 axis1;
+   float32_t3 axis2; // axis3 reconstructed via getAxis3() = cross(axis1, axis2)
+
+   // Per-edge cross products in world space. Populated during Pass 1's
+   // centroid accumulation (also cached for caliper scoring), used by
+   // isInside(dir) in generate().
+   shapes::SilEdgeNormals silEdgeNormals;
+
+   // Constructed by create(silhouette, view) via tag-dispatched buildInner.
+   // The synth-vertices path (createFromVertices direct) leaves it default-init.
+   InnerSampler inner;
+
+   float32_t3 getAxis3() NBL_CONST_MEMBER_FUNC { return cross(axis1, axis2); } // third frame axis, derived from the two stored axes instead of being stored
+
+   // Pass 1: per-edge cross + Stokes centroid; UseCaliper=false also tracks
+   // the longest world edge here. Out params exist in both modes so the
+   // per-count cascade has one signature; DCE drops the longest-edge body when
+   // UseCaliper=true.
+   template<uint32_t I, uint32_t J>
+   void processEdge(float32_t3 vertices[shapes::MaxOBBSilhouetteVertices], NBL_REF_ARG(float32_t3) unnormCentroid, NBL_REF_ARG(float32_t) bestLenSq, NBL_REF_ARG(float32_t3) bestEdge3d, NBL_REF_ARG(uint32_t) bestEdge)
+   {
+      const float32_t3 vI = vertices[I]; // edge start
+      const float32_t3 vJ = vertices[J]; // edge end
+
+      const float32_t3 c            = cross(vI, vJ); // unnormalized cross product of the two endpoints
+      silEdgeNormals.edgeNormals[I] = c; // stored under the edge's starting index I
+      unnormCentroid += c; // running sum over all edges; createFromVertices negates and normalizes it into axis3
+
+      if (!UseCaliper) // compile-time constant: the longest-edge tracking below only matters without the caliper pass
+      {
+         // Explicit nbl::hlsl::select so DXC emits scalar-conditional OpSelect
+         // for the vec3 update instead of a bool-broadcast v3bool.
+         const float32_t3 edge3d = vJ - vI;
+         const float32_t  lenSq  = dot(edge3d, edge3d); // squared length is enough for comparing edges
+         const bool       isBest = lenSq > bestLenSq; // strict: on ties the earlier edge is kept
+         bestLenSq               = max(lenSq, bestLenSq);
+         bestEdge3d              = nbl::hlsl::select(isBest, edge3d, bestEdge3d);
+         bestEdge                = nbl::hlsl::select(isBest, I, bestEdge);
+      }
+   }
+
+   // Caliper-only helpers (DCE'd when UseCaliper=false).
+
+   // Track the silhouette vertex with max dot(vK, precross). SkipA/SkipB are
+   // the candidate edge's (I, J); compile-time skipped (drops the verts[K]
+   // read entirely). Assumes vertices are ~unit length so we can skip the
+   // per-K |vK| factor in the cosine.
+   template<uint32_t K, uint32_t SkipA, uint32_t SkipB>
+   static void tryK(float32_t3 vertices[shapes::MaxOBBSilhouetteVertices], float32_t3 precross, NBL_REF_ARG(float32_t) bestNum, NBL_REF_ARG(float32_t3) bestC)
+   {
+      if (K != SkipA && K != SkipB) // all three are template constants, so this is decided at compile time
+      {
+         const float32_t3 vK     = vertices[K];
+         const float32_t  num    = dot(vK, precross); // the quantity being maximized over K
+         const bool       better = num > bestNum; // strict: on ties the earlier K is kept
+         bestNum                 = max(num, bestNum);
+         bestC                   = nbl::hlsl::select(better, vK, bestC);
+      }
+   }
+
+   // Cascade-on-count K scan with (I, J) as compile-time skips. bestNum seeds
+   // at -1e30f (stand-in for -inf); bestC's placeholder is always overwritten (count >= 3).
+   template<uint32_t I, uint32_t J>
+   static float32_t3 findExtremalC(float32_t3 vertices[shapes::MaxOBBSilhouetteVertices], uint32_t count, float32_t3 precross)
+   {
+      float32_t  bestNum = -1e30f; // very negative seed so the first non-skipped vertex wins
+      float32_t3 bestC   = vertices[0]; // placeholder until a tryK call replaces it
+      tryK<0, I, J>(vertices, precross, bestNum, bestC); // indices 0..2 are tried unconditionally
+      tryK<1, I, J>(vertices, precross, bestNum, bestC);
+      tryK<2, I, J>(vertices, precross, bestNum, bestC);
+      if (count > 3) // each further index is only tried when count says it is in use
+      {
+         tryK<3, I, J>(vertices, precross, bestNum, bestC);
+         if (count > 4)
+         {
+            tryK<4, I, J>(vertices, precross, bestNum, bestC);
+            if (count > 5)
+            {
+               tryK<5, I, J>(vertices, precross, bestNum, bestC);
+               if (count > 6)
+                  tryK<6, I, J>(vertices, precross, bestNum, bestC);
+            }
+         }
+      }
+      return bestC; // vertex with the largest dot(vK, precross) among the tried ones
+   }
+
+   // Score candidate edge (I, J) by cos(dihedral) between AB-great-circle
+   // and Lexell plane through (-A, -B, C_win). Identity used:
+   //   cross(C+A, C+B) = n0 + cross(A, C) + cross(C, B)
+   // so we reuse cached n0. Larger score = smaller bounding lune. max(.,1e-30f)
+   // keeps rsqrt finite on collapsed edges (they lose on numerator anyway).
+   template<uint32_t I, uint32_t J>
+   static void evalCandidate(float32_t3 vertices[shapes::MaxOBBSilhouetteVertices], uint32_t count, NBL_CONST_REF_ARG(shapes::SilEdgeNormals) sen, NBL_REF_ARG(float32_t) bestScore, NBL_REF_ARG(float32_t3) bestEdge3d, NBL_REF_ARG(uint32_t) bestEdge)
+   {
+      const float32_t3 vI     = vertices[I];
+      const float32_t3 vJ     = vertices[J];
+      const float32_t3 n0     = sen.edgeNormals[I]; // per-edge cross product read from the cache instead of being recomputed
+      const float32_t3 edge3d = vJ - vI;
+
+      const float32_t3 precross = cross(edge3d, n0); // direction along which the opposing vertex is searched
+      const float32_t3 C        = findExtremalC<I, J>(vertices, count, precross);
+
+      const float32_t3 lexell_n1   = n0 + cross(vI, C) + cross(C, vJ); // right-hand side of the identity quoted above this function
+      const float32_t  numerator   = dot(n0, lexell_n1);
+      const float32_t  edgeDenomSq = dot(n0, n0) * dot(lexell_n1, lexell_n1); // product of the two squared lengths
+      const float32_t  score       = numerator * rsqrt(max(edgeDenomSq, 1e-30f)); // cosine between n0 and lexell_n1; the max() keeps rsqrt finite
+
+      const bool better = score > bestScore; // strict: on ties the earlier candidate is kept
+      bestScore         = max(score, bestScore);
+      bestEdge3d        = nbl::hlsl::select(better, edge3d, bestEdge3d);
+      bestEdge          = nbl::hlsl::select(better, I, bestEdge);
+   }
+
+   // Gnomonic-project each silhouette vertex into the (axis1, axis2, axis3)
+   // frame and accumulate the AABB.
+   template<uint32_t I>
+   static void boundOne3D(float32_t3 vertices[shapes::MaxOBBSilhouetteVertices], float32_t3 axis1, float32_t3 perp, float32_t3 axis3, NBL_REF_ARG(float32_t4) bound)
+   {
+      const float32_t3 vert  = vertices[I];
+      const float32_t  rcpDp = rcp(dot(vert, axis3)); // NOTE(review): no guard for dot(vert, axis3) == 0 here - presumably callers keep vertices off that plane; confirm
+      const float32_t  x     = dot(vert, axis1) * rcpDp; // axis1 coordinate divided by the axis3 coordinate
+      const float32_t  y     = dot(vert, perp) * rcpDp; // same for the `perp` axis
+      bound.x                = min(bound.x, x); // bound layout is (minX, minY, maxX, maxY)
+      bound.y                = min(bound.y, y);
+      bound.z                = max(bound.z, x);
+      bound.w                = max(bound.w, y);
+   }
+
+   static void computeBound3D(float32_t3 vertices[shapes::MaxOBBSilhouetteVertices], uint32_t count, float32_t3 axis1, float32_t3 perp, float32_t3 axis3, NBL_REF_ARG(float32_t4) bound)
+   {
+      bound = float32_t4(1e10f, 1e10f, -1e10f, -1e10f); // starts inverted (min above max) so the first vertex sets both ends
+      boundOne3D<0>(vertices, axis1, perp, axis3, bound); // vertices 0..2 are always accumulated
+      boundOne3D<1>(vertices, axis1, perp, axis3, bound);
+      boundOne3D<2>(vertices, axis1, perp, axis3, bound);
+      if (count > 3) // each further vertex is only accumulated when count says it is in use
+      {
+         boundOne3D<3>(vertices, axis1, perp, axis3, bound);
+         if (count > 4)
+         {
+            boundOne3D<4>(vertices, axis1, perp, axis3, bound);
+            if (count > 5)
+            {
+               boundOne3D<5>(vertices, axis1, perp, axis3, bound);
+               if (count > 6)
+                  boundOne3D<6>(vertices, axis1, perp, axis3, bound);
+            }
+         }
+      }
+   }
+
+   // Builds axis1/axis2, the per-edge normals and the gnomonic bounding rect from `count` (3..7) pre-materialized
+   // verts; (outRectR0, outRectExtents) are out-params only (not stored on the pyramid) and `inner` is not set here.
+   static SphericalPyramid<UseCaliper, InnerSampler> createFromVertices(float32_t3 vertices[shapes::MaxOBBSilhouetteVertices], uint32_t count, NBL_REF_ARG(float32_t2) outRectR0, NBL_REF_ARG(float32_t2) outRectExtents)
+   {
+      SphericalPyramid<UseCaliper, InnerSampler> self;
+      // Sentinel-init so unused slots (count..6) produce dot(dir,(0,0,-1)) < 0
+      // for the sign-bit AND in shapes::SilEdgeNormals::isInside.
+      self.silEdgeNormals = shapes::SilEdgeNormals::initSentinel();
+
+      // Tiny z-bias seed so symmetric shapes don't normalize(0) to NaN; the
+      // cross sum dominates for any non-degenerate silhouette.
+      // verts past count are zero-init by materialize, so reading them is harmless.
+      float32_t3 unnormCentroid = float32_t3(0.0f, 0.0f, 1e-6f);
+      float32_t  bestLenSq      = 0.0f;
+      float32_t3 bestEdge3d     = float32_t3(1.0f, 0.0f, 0.0f);
+      uint32_t   bestEdge       = 0;
+
+      self.processEdge<0, 1>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge);
+      self.processEdge<1, 2>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge);
+      if (count == 3)
+      {
+         self.processEdge<2, 0>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge);
+      }
+      else
+      {
+         self.processEdge<2, 3>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge);
+         if (count == 4)
+         {
+            self.processEdge<3, 0>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge);
+         }
+         else
+         {
+            self.processEdge<3, 4>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge);
+            if (count == 5)
+            {
+               self.processEdge<4, 0>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge);
+            }
+            else
+            {
+               self.processEdge<4, 5>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge);
+               if (count == 6)
+               {
+                  self.processEdge<5, 0>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge);
+               }
+               else // count == 7
+               {
+                  self.processEdge<5, 6>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge);
+                  self.processEdge<6, 0>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge);
+               }
+            }
+         }
+      }
+
+      const float32_t3 axis3 = normalize(-unnormCentroid);
+
+      // Pass 2: caliper dihedral scan overwrites bestEdge3d. Skipped under
+      // UseCaliper=false (keeps Pass 1's longest edge).
+      if (UseCaliper)
+      {
+         float32_t bestScore = -2.0f;
+
+         evalCandidate<0, 1>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge);
+         evalCandidate<1, 2>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge);
+         if (count == 3)
+         {
+            evalCandidate<2, 0>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge);
+         }
+         else
+         {
+            evalCandidate<2, 3>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge);
+            if (count == 4)
+            {
+               evalCandidate<3, 0>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge);
+            }
+            else
+            {
+               evalCandidate<3, 4>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge);
+               if (count == 5)
+               {
+                  evalCandidate<4, 0>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge);
+               }
+               else
+               {
+                  evalCandidate<4, 5>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge);
+                  if (count == 6)
+                  {
+                     evalCandidate<5, 0>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge);
+                  }
+                  else // count == 7
+                  {
+                     evalCandidate<5, 6>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge);
+                     evalCandidate<6, 0>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge);
+                  }
+               }
+            }
+         }
+      }
+
+      // axis1 = winning chord projected onto plane(axis3) and normalized.
+      // max(lenSq, 1e-12) keeps rsqrt finite; the degenerate fallback is a world X/Y seed
+      // re-projected onto plane(axis3), so axis2 stays unit and getAxis3() still equals axis3.
+      const float32_t3 inPlaneEdge  = bestEdge3d - axis3 * dot(bestEdge3d, axis3);
+      const float32_t  inPlaneLenSq = dot(inPlaneEdge, inPlaneEdge);
+      const bool       useY         = abs(axis3.x) >= 0.9f;
+      const float32_t  scale        = rsqrt(max(inPlaneLenSq, 1e-12f));
+      const bool       degenerate    = inPlaneLenSq <= 1e-12f;
+      const float32_t3 fallbackSeed  = nbl::hlsl::select(useY, float32_t3(0.0f, 1.0f, 0.0f), float32_t3(1.0f, 0.0f, 0.0f));
+      const float32_t3 fallbackAxis1 = normalize(fallbackSeed - axis3 * dot(fallbackSeed, axis3)); // useY keeps the seed away from axis3, so this never normalizes ~0
+      self.axis1                     = nbl::hlsl::select(degenerate, fallbackAxis1, inPlaneEdge * scale);
+      self.axis2                     = cross(axis3, self.axis1);
+
+      float32_t4 bestBound;
+      computeBound3D(vertices, count, self.axis1, self.axis2, axis3, bestBound);
+
+      // Per-axis degenerate clamp: each upper bound at least 1e-6 above lower.
+      // Independent per axis so a single collapsed axis doesn't kill the other.
+      bestBound.zw = max(bestBound.zw, bestBound.xy + 1e-6f);
+
+      outRectR0      = bestBound.xy;
+      outRectExtents = float32_t2(bestBound.zw - bestBound.xy);
+
+      // Pre-rotate edge normals into local frame so per-sample inside test
+      // can use the cheaper 2D form (2 muls + 2 adds + n.z per edge instead
+      // of 3 muls + 2 adds). Amortized once per build; saves 7 muls/sample.
+      self.silEdgeNormals.transformToLocal(self.axis1, self.axis2, axis3);
+
+      return self;
+   }
+
+   // Materialize verts (in shading-point-relative coords baked into silhouette)
+   // from the silhouette, build the pyramid, then construct the InnerSampler
+   // via tag-dispatched buildInner. Local rect data dies at end-of-scope; only
+   // the inner sampler retains a copy.
+   static SphericalPyramid<UseCaliper, InnerSampler> create(NBL_CONST_REF_ARG(shapes::ClippedSilhouette) silhouette, shapes::OBBView<float32_t> view)
+   {
+      float32_t3 vertices[shapes::MaxOBBSilhouetteVertices]; // local scratch array, filled by materialize below
+      silhouette.materialize(view, vertices);
+
+      float32_t2 rectR0, rectExtents; // written by createFromVertices through its out-params
+      SphericalPyramid<UseCaliper, InnerSampler> self = createFromVertices(vertices, silhouette.count, rectR0, rectExtents);
+
+      // tag's value is unread; only its type selects the overload.
+      const float32_t3x3 basis = float32_t3x3(self.axis1, self.axis2, self.getAxis3()); // frame handed to the inner sampler
+      InnerSampler tag;
+      self.inner = buildInner(basis, rectR0, rectExtents, tag);
+      return self;
+   }
+
+   // Generate via inner.generateNormalizedLocal so we can recover gnomonic
+   // (localX, localY) for the 2D inside test. With rectR0.z == 1, localDir.z =
+   // 1/hitDist, so localDir.{x,y} * hitDist == gnomonic coords. Bake
+   // silhouette/horizon validity into cache.pdf so forwardPdf is O(1).
+   codomain_type generate(domain_type u, NBL_REF_ARG(cache_type) cache)
+   {
+      scalar_type          hitDist; // passed to generateNormalizedLocal below, which presumably fills it in
+      const codomain_type  localDir = inner.generateNormalizedLocal(u, cache.inner, hitDist);
+      const codomain_type  dir      = localDir.x * axis1 + localDir.y * axis2 + localDir.z * getAxis3(); // local direction re-expressed through the pyramid's three axes
+      const scalar_type    localX   = localDir.x * hitDist; // coordinates fed to the 2D inside test
+      const scalar_type    localY   = localDir.y * hitDist;
+      const bool           valid    = dir.z > 0.0f && silEdgeNormals.isInsideLocal(localX, localY); // needs positive dir.z and a passing inside test
+      cache.pdf                     = hlsl::select(valid, inner.forwardPdf(u, cache.inner), 0.0f); // rejected samples get pdf 0; forwardPdf()/forwardWeight() just return this
+      return dir;
+   }
+
+   density_type forwardPdf(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; } // value precomputed by generate(); u is unused
+   weight_type  forwardWeight(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; } // same cached value as forwardPdf(); u is unused
+   uint32_t     selectedIdx(cache_type cache) NBL_CONST_MEMBER_FUNC { return 0u; } // always 0; the cache argument is ignored
+};
+
+}
+}
+}
+
+#endif
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
index 131cc92d70..e3cbb70998 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl
@@ -19,156 +19,214 @@ namespace hlsl
 namespace sampling
 {
 
-template<typename T>
+// UseRealSinCos=true  (default): math::sincos uses real sin+cos; full precision near au=n*pi, Jacobian test passes.
+// UseRealSinCos=false          : math::sincos uses cos + sqrt(1-c*c); saves one special-function op but loses mantissa as |cos_au| -> 1.
+template<typename T, bool UseRealSinCos = true>
 struct SphericalRectangle
 {
-    using scalar_type = T;
-    using vector2_type = vector<T, 2>;
-    using vector3_type = vector<T, 3>;
-    using vector4_type = vector<T, 4>;
-
-    // BackwardTractableSampler concept types
-    using domain_type = vector2_type;
-    using codomain_type = vector3_type;
-    using density_type = scalar_type;
-    using weight_type = density_type;
-
-    struct cache_type {};
-
-    NBL_CONSTEXPR_STATIC_INLINE scalar_type ClampEps = 1e-5;
-
-    static SphericalRectangle<T> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) rect, const vector3_type observer)
-    {
-        return create(rect.solidAngle(observer), rect.extents);
-    }
-
-    static SphericalRectangle<T> create(NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa, const vector2_type _extents)
-    {
-        SphericalRectangle<T> retval;
-        retval.r0 = sa.r0;
-        retval.extents = _extents;
-
-        retval.solidAngle = sa.value;
-        retval.b0 = sa.n_z[0];
-        retval.b1 = sa.n_z[2];
-
-        math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[2]);
-        angle_adder.addCosine(sa.cosGamma[3]);
-        retval.k = scalar_type(2.0) * numbers::pi<scalar_type> - angle_adder.getSumOfArccos();
-
-        return retval;
-    }
-
-    // Create directly from a local-frame corner position and rectangle extents.
-    // Use when you already know r0 (e.g. from a gnomonic projection) and don't
-    // need the shapes::SphericalRectangle + solidAngle(observer) roundtrip.
-    static SphericalRectangle<T> create(const vector3_type _r0, const vector2_type _extents)
-    {
-        // Same math as shapes::SphericalRectangle::solidAngle() but without
-        // the mul(basis, origin - observer) step since we already have r0.
-        typename shapes::SphericalRectangle<T>::solid_angle_type sa;
-        sa.r0 = _r0;
-
-        const scalar_type zSq = _r0.z * _r0.z;
-        const vector4_type denorm_n_z = vector4_type(-_r0.y, _r0.x + _extents.x, _r0.y + _extents.y, -_r0.x);
-        sa.n_z = denorm_n_z * hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(zSq) + denorm_n_z * denorm_n_z);
-        sa.cosGamma = vector4_type(
-            -sa.n_z[0] * sa.n_z[1], -sa.n_z[1] * sa.n_z[2],
-            -sa.n_z[2] * sa.n_z[3], -sa.n_z[3] * sa.n_z[0]);
-
-        math::sincos_accumulator<scalar_type> acc = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[0]);
-        acc.addCosine(sa.cosGamma[1]);
-        acc.addCosine(sa.cosGamma[2]);
-        acc.addCosine(sa.cosGamma[3]);
-        sa.value = acc.getSumOfArccos() - scalar_type(2.0) * numbers::pi<scalar_type>;
-
-        return create(sa, _extents);
-    }
-
-    // shared core of generate and generateSurfaceOffset
-    // returns (xu, hv, d) packed into a vector3; caller derives either 2D offset or 3D direction
-    vector3_type __generate(const domain_type u) NBL_CONST_MEMBER_FUNC
-    {
-        // algorithm needs r0.z < 0; use -abs(r0.z) without storing the flip
-        const scalar_type negAbsR0z = -hlsl::abs(r0.z);
-        const scalar_type r0zSq = r0.z * r0.z;
-        const vector2_type r1 = vector2_type(r0.x + extents.x, r0.y + extents.y);
-
-        const scalar_type au = u.x * solidAngle + k;
-        const scalar_type cos_au = hlsl::cos<scalar_type>(au);
-        const scalar_type numerator = b1 - cos_au * b0;
-        // (1-cos)*(1+cos) avoids catastrophic cancellation of 1-cos^2 when cos_au is near +/-1
-        const scalar_type sin_au_sq = (scalar_type(1.0) - cos_au) * (scalar_type(1.0) + cos_au);
-        const scalar_type absNegFu = hlsl::abs(numerator) * hlsl::rsqrt<scalar_type>(sin_au_sq);
-        const scalar_type rcpCu_2 = hlsl::max<scalar_type>(absNegFu * absNegFu + b0 * b0, scalar_type(1.0));
-        // sign(negFu) = sign(numerator) * sign(sin(au)); sin(au) < 0 iff au > PI
-        const scalar_type negFuSign = hlsl::select((au > numbers::pi<scalar_type>) != (numerator < scalar_type(0.0)), scalar_type(-1.0), scalar_type(1.0));
-        scalar_type xu = negAbsR0z * negFuSign * hlsl::rsqrt<scalar_type>(rcpCu_2 - scalar_type(1.0));
-        xu = hlsl::clamp<scalar_type>(xu, r0.x, r1.x); // avoid Infs
-        const scalar_type d_2 = xu * xu + r0zSq;
-        const scalar_type d = hlsl::sqrt<scalar_type>(d_2);
-
-        const scalar_type h0 = r0.y * hlsl::rsqrt<scalar_type>(d_2 + r0.y * r0.y);
-        const scalar_type h1 = r1.y * hlsl::rsqrt<scalar_type>(d_2 + r1.y * r1.y);
-        const scalar_type hv = h0 + u.y * (h1 - h0);
-
-        return vector3_type(xu, hv, d);
-    }
-
-    // returns a normalized 3D direction in the local frame with correct r0.z sign
-    codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-    {
-        const vector3_type core = __generate(u);
-        const scalar_type xu = core.x;
-        const scalar_type hv = core.y;
-        const scalar_type d = core.z;
-        const scalar_type hv2 = hv * hv;
-        const scalar_type cosElevation = hlsl::sqrt<scalar_type>(hlsl::max<scalar_type>(scalar_type(1.0) - hv2, scalar_type(0.0)));
-        const scalar_type rcpD = scalar_type(1.0) / d;
-
-        return vector3_type(xu * cosElevation * rcpD, hv, r0.z * cosElevation * rcpD);
-    }
-
-    // returns a 2D offset on the rectangle surface from the r0 corner
-    vector2_type generateSurfaceOffset(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-    {
-        const vector3_type core = __generate(u);
-        const scalar_type xu = core.x;
-        const scalar_type hv = core.y;
-        const scalar_type d = core.z;
-        const scalar_type r1y = r0.y + extents.y;
-        const scalar_type hv2 = hv * hv;
-        const scalar_type yv = hlsl::mix(r1y, (hv * d) / hlsl::sqrt<scalar_type>(scalar_type(1.0) - hv2), hv2 < scalar_type(1.0) - ClampEps);
-
-        return vector2_type((xu - r0.x), (yv - r0.y));
-    }
-
-    density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-    {
-        return scalar_type(1.0) / solidAngle;
-    }
-
-    weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-    {
-        return forwardPdf(u, cache);
-    }
-
-    density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
-    {
-        return scalar_type(1.0) / solidAngle;
-    }
-
-    weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
-    {
-        return backwardPdf(L);
-    }
-
-    scalar_type solidAngle;
-    scalar_type k;
-    scalar_type b0;
-    scalar_type b1;
-    vector3_type r0;
-    vector2_type extents;
+   using scalar_type    = T;
+   using vector2_type   = vector<T, 2>;
+   using vector3_type   = vector<T, 3>;
+   using vector4_type   = vector<T, 4>;
+   using matrix3x3_type = matrix<T, 3, 3>;
+
+   // BackwardTractableSampler concept types
+   using domain_type   = vector2_type;
+   using codomain_type = vector3_type;
+   using density_type  = scalar_type;
+   using weight_type   = density_type;
+
+   struct cache_type
+   {
+   };
+
+   static SphericalRectangle<T, UseRealSinCos> create(NBL_CONST_REF_ARG(shapes::SphericalRectangle<T>) rect, const vector3_type observer)
+   {
+      return create(rect.basis, rect.solidAngle(observer), rect.extents); // forwards to the (basis, solid_angle_type, extents) overload
+   }
+
+   static SphericalRectangle<T, UseRealSinCos> create(const matrix3x3_type _basis, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle<T>::solid_angle_type) sa, const vector2_type _extents)
+   {
+      SphericalRectangle<T, UseRealSinCos> retval;
+      retval.basis   = _basis;
+      retval.r0      = sa.r0;
+      retval.extents = _extents;
+
+      retval.solidAngle = sa.value;
+      retval.b0         = sa.n_z[0];
+      retval.b1         = sa.n_z[2];
+
+      math::sincos_accumulator<scalar_type> angle_adder = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[2]);
+      angle_adder.addCosine(sa.cosGamma[3]);
+      retval.k = _static_cast<scalar_type>(2.0) * numbers::pi<scalar_type> - angle_adder.getSumOfArccos();
+
+      return retval;
+   }
+
+   // Create directly from a local-frame corner position and rectangle extents.
+   // Use when you already know r0 (e.g. from a gnomonic projection) and don't
+   // need the shapes::SphericalRectangle + solidAngle(observer) roundtrip.
+   static SphericalRectangle<T, UseRealSinCos> create(const matrix3x3_type _basis, const vector3_type _r0, const vector2_type _extents)
+   {
+      // Same math as shapes::SphericalRectangle::solidAngle() but without
+      // the mul(basis, origin - observer) step since we already have r0.
+      typename shapes::SphericalRectangle<T>::solid_angle_type sa;
+      sa.r0 = _r0;
+
+      const scalar_type zSq         = _r0.z * _r0.z;
+      const vector4_type denorm_n_z = vector4_type(-_r0.y, _r0.x + _extents.x, _r0.y + _extents.y, -_r0.x);
+      sa.n_z                        = denorm_n_z * hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(zSq) + denorm_n_z * denorm_n_z);
+      sa.cosGamma                   = vector4_type(
+         -sa.n_z[0] * sa.n_z[1], -sa.n_z[1] * sa.n_z[2],
+         -sa.n_z[2] * sa.n_z[3], -sa.n_z[3] * sa.n_z[0]);
+
+      math::sincos_accumulator<scalar_type> acc = math::sincos_accumulator<scalar_type>::create(sa.cosGamma[0]);
+      acc.addCosine(sa.cosGamma[1]);
+      acc.addCosine(sa.cosGamma[2]);
+      acc.addCosine(sa.cosGamma[3]);
+      sa.value = acc.getSumOfArccos() - _static_cast<scalar_type>(2.0) * numbers::pi<scalar_type>;
+
+      return create(_basis, sa, _extents);
+   }
+
+   // shared core of generateNormalizedLocal and generateLocalBasisXY
+   // returns xu, d2, hv and cosElevation2 in an SCommonGen; the caller derives either the 3D direction or the 2D rectangle point
+   struct SCommonGen
+   {
+      scalar_type xu; // sampled local x, clamped to [r0.x, r0.x + extents.x]
+      scalar_type d2; // xu * xu + r0.z * r0.z
+      scalar_type hv; // h0 + u.y * (h1 - h0), see __generate
+      scalar_type cosElevation2; // 1 - min(hv * hv, 1)
+   };
+   SCommonGen __generate(const domain_type u) NBL_CONST_MEMBER_FUNC
+   {
+      SCommonGen retval;
+
+      // algorithm needs r0.z < 0; use -abs(r0.z) without storing the flip
+      const scalar_type negAbsR0z = -hlsl::abs(r0.z);
+      const scalar_type r0zSq     = r0.z * r0.z;
+      const vector2_type r1       = vector2_type(r0.x + extents.x, r0.y + extents.y);
+
+      // au in [0, 4*pi] since u.x in [0,1], solidAngle <= 2*pi, k <= 2*pi.
+      // The sqrt path in math::sincos recovers sin sign via sign(theta), which only holds for theta in [-pi,pi].
+      // Real sin/cos handle any range, so only wrap on the sqrt path (compile-time branch folds away).
+      scalar_type au = u.x * solidAngle + k;
+      NBL_IF_CONSTEXPR(!UseRealSinCos)
+      {
+         // au in [0, 4*pi] -> peel at most two 2*pi periods to land in (-pi, pi].
+         au = hlsl::select(au > numbers::pi<scalar_type>, au - _static_cast<scalar_type>(2.0) * numbers::pi<scalar_type>, au);
+         au = hlsl::select(au > numbers::pi<scalar_type>, au - _static_cast<scalar_type>(2.0) * numbers::pi<scalar_type>, au);
+      }
+      scalar_type sin_au, cos_au;
+      math::sincos<scalar_type, UseRealSinCos>(au, sin_au, cos_au);
+      const scalar_type numerator = b1 - cos_au * b0;
+      // negFu carries the sign directly (numerator and sin_au both signed), so xu's sign drops
+      // out of a single multiply + hlsl::sign.
+      const scalar_type negFu   = numerator / sin_au;
+      const scalar_type rcpCu_2 = hlsl::max<scalar_type>(negFu * negFu + b0 * b0, _static_cast<scalar_type>(1.0));
+      retval.xu                 = negAbsR0z * hlsl::sign(negFu) * hlsl::rsqrt<scalar_type>(rcpCu_2 - _static_cast<scalar_type>(1.0));
+      retval.xu = hlsl::clamp<scalar_type>(retval.xu, r0.x, r1.x); // avoid Infs
+      retval.d2 = retval.xu * retval.xu + r0zSq;
+
+      const scalar_type h0 = r0.y * hlsl::rsqrt<scalar_type>(retval.d2 + r0.y * r0.y);
+      const scalar_type h1 = r1.y * hlsl::rsqrt<scalar_type>(retval.d2 + r1.y * r1.y);
+      retval.hv            = h0 + u.y * (h1 - h0);
+      retval.cosElevation2 = _static_cast<scalar_type>(1.0) - hlsl::min<scalar_type>(retval.hv * retval.hv, 1);
+
+      return retval;
+   }
+
+   // returns a normalized 3D direction in the local frame with correct r0.z sign; `hitDist` receives the distance along it to the rectangle
+   vector3_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC
+   {
+      const SCommonGen core         = __generate(u);
+      scalar_type cosElevationOverD = hlsl::rsqrt(core.d2 / core.cosElevation2);
+      // TODO: or shall we do some other more sophisticated clamp or correction? Is this even the right one to use?
+      cosElevationOverD = hlsl::select(hlsl::isnan(cosElevationOverD), _static_cast<scalar_type>(1.0), cosElevationOverD);
+
+      // TODO: investigate whether precision requires computing this as a `sqrt`; the quantity under the root is then `core.d2/core.cosElevation2`
+      // (what `generateLocalBasisXY` also needs), in which case `__generate` can already compute it and return it as `hitDist2`
+      hitDist = _static_cast<scalar_type>(1.0) / cosElevationOverD;
+
+      // x/hitDist == x * cosElevationOverD; saves two divs vs the obvious form.
+      const vector3_type retval = vector3_type(core.xu * cosElevationOverD, core.hv, r0.z * cosElevationOverD);
+      assert(!hlsl::isnan(computeHitT(basis[0] * retval[0] + basis[1] * retval[1] + basis[2] * retval[2]))); // computeHitT takes a world-space L
+      return retval;
+   }
+
+   codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      scalar_type dummy; // the hit distance output is not used here
+      const vector3_type localL = generateNormalizedLocal(u, cache, dummy);
+      // could return `hlsl::mul(hlsl::transpose(basis),localL)` or just this
+      return basis[0] * localL[0] + basis[1] * localL[1] + basis[2] * localL[2];
+   }
+
+   // utility to determine maxT for a ray shot from the origin along L, which we're sure intersects the rectangle
+   scalar_type computeHitT(const vector3_type L) NBL_CONST_MEMBER_FUNC
+   {
+      const scalar_type retval = hlsl::abs(r0.z / hlsl::dot(L, basis[2])); // |r0.z| over the component of L along basis[2]
+      {
+         const vector3_type hitPointRelative = L * retval;
+         const vector2_type uv               = mul(basis, hitPointRelative).xy - r0.xy; // hit point relative to the r0 corner, in local xy
+         const vector2_type tol              = hlsl::max<vector2_type>(hlsl::abs(extents), hlsl::promote<vector2_type>(_static_cast<scalar_type>(1.0))) * _static_cast<scalar_type>(1e-5); // 1e-5 scaled by max(|extents|, 1)
+         assert(uv[0] >= -tol[0] && uv[0] <= extents[0] + tol[0]);
+         assert(uv[1] >= -tol[1] && uv[1] <= extents[1] + tol[1]);
+      }
+      return retval;
+   }
+
+   // returns the 2D position on the rectangle surface in the local basis, r0 corner included - useful for generating unnormalized worldspace L
+   vector2_type generateLocalBasisXY(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const SCommonGen core = __generate(u);
+      const scalar_type r1y = r0.y + extents.y;
+      // TODO: see if we can compute this directly from the definition of `d2`, ignoring the clamps on `xu` and `cosElevation2`
+      const scalar_type yv = core.hv * hlsl::rsqrt(core.cosElevation2 / core.d2);
+
+      // fun fact: when one operand is NaN, the SPIR-V NMin/NMax builtins select the other one (FMin/FMax leave it undefined) - TODO confirm which `hlsl::clamp` maps to
+      // TODO: maybe try just `min(yv,r1y)`
+      const vector2_type retval = vector2_type(core.xu, hlsl::clamp(yv, r0.y, r1y));
+      assert(retval[0] >= r0.x && retval[1] >= r0.y);
+      assert(retval[0] <= r0.x + extents[0] && retval[1] <= r0.y + extents[1]);
+      return retval;
+   }
+
+   // it is basically the hit point minus the observer origin
+   codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
+   {
+      const vector2_type localXY = generateLocalBasisXY(u, cache);
+      // `localXY` already contains r0.xy, so only r0.z is taken from r0 here
+      return basis[0] * localXY[0] + basis[1] * localXY[1] + basis[2] * r0.z;
+   }
+
+   density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return _static_cast<scalar_type>(1.0) / solidAngle;
+   }
+
+   weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
+   {
+      return forwardPdf(u, cache);
+   }
+
+   density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      return _static_cast<scalar_type>(1.0) / solidAngle;
+   }
+
+   weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
+   {
+      return backwardPdf(L);
+   }
+
+   matrix3x3_type basis; // rows are the local frame axes; generate() recombines local coordinates with them
+   vector3_type r0; // rectangle corner in the local frame (sa.r0 in create)
+   vector2_type extents; // rectangle size along local x and y
+   scalar_type solidAngle; // sa.value in create
+   scalar_type k; // 2*pi - angle_adder.getSumOfArccos() over cosGamma[2] and cosGamma[3]
+   scalar_type b0; // sa.n_z[0]
+   scalar_type b1; // sa.n_z[2]
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
index 6f29582e04..b915a55de7 100644
--- a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl
@@ -11,6 +11,7 @@
 #include <nbl/builtin/hlsl/math/quaternions.hlsl>
 #include <nbl/builtin/hlsl/math/fast_acos.hlsl>
 #include <nbl/builtin/hlsl/shapes/spherical_triangle.hlsl>
+#include <nbl/builtin/hlsl/ieee754.hlsl>
 
 namespace nbl
 {
@@ -25,9 +26,6 @@ enum SphericalTriangleAlgorithm : uint16_t
    STA_PBRT = 1
 };
 
-template<typename T, bool Bijective = false, SphericalTriangleAlgorithm Algorithm = STA_ARVO>
-struct SphericalTriangle;
-
 namespace impl
 {
 
@@ -53,9 +51,8 @@ T sumOfProducts(T a, T b, T c, T d)
 
 } // namespace impl
 
-// Non-bijective: Resamplable & Tractable (generate + pdf/weight, no inverse)
-template<typename T, SphericalTriangleAlgorithm Algorithm>
-struct SphericalTriangle<T, false, Algorithm>
+template<typename T, SphericalTriangleAlgorithm Algorithm = STA_ARVO>
+struct SphericalTriangle
 {
    using scalar_type = T;
    using vector2_type = vector<T, 2>;
@@ -73,22 +70,23 @@ struct SphericalTriangle<T, false, Algorithm>
    static SphericalTriangle create(NBL_CONST_REF_ARG(shapes::SphericalTriangle<T>) tri)
    {
       SphericalTriangle retval;
-      retval.rcpSolidAngle = scalar_type(1.0) / tri.solid_angle;
+      retval.rcpSolidAngle = _static_cast<scalar_type>(1.0) / tri.solid_angle;
       retval.tri_vertices[0] = tri.vertices[0];
       retval.tri_vertices[1] = tri.vertices[1];
-      retval.triCosC = tri.cos_sides[2];
+      retval.triCosc = tri.cos_sides[2];
       // precompute great circle normal of arc AC: cross(A,C) has magnitude sin(b),
       // so multiplying by csc(b) normalizes it; zero when side AC is degenerate
-      const scalar_type cscB = tri.csc_sides[1];
-      const vector3_type arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscB < numeric_limits<scalar_type>::max, cscB, scalar_type(0));
+      const scalar_type cscb = tri.csc_sides[1];
+      const vector3_type arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscb < numeric_limits<scalar_type>::max, cscb, _static_cast<scalar_type>(0));
       retval.e_C = hlsl::cross(arcACPlaneNormal, tri.vertices[0]);
       retval.cosA = tri.cos_vertices[0];
       retval.sinA = tri.sin_vertices[0];
       if (Algorithm == STA_ARVO)
       {
-         retval.sinA_triCosC = retval.sinA * retval.triCosC;
+         retval.sinA_triCosc = retval.sinA * retval.triCosc;
          retval.eCdotB = hlsl::dot(retval.e_C, tri.vertices[1]);
       }
+      retval.APlusC = tri.vertices[0] + tri.vertices[2];
       return retval;
    }
 
@@ -106,36 +104,37 @@ struct SphericalTriangle<T, false, Algorithm>
       if (Algorithm == STA_ARVO) // faster than PBRT
       {
          const scalar_type u_ = t - cosA;
-         const scalar_type v_ = s + sinA_triCosC;
+         const scalar_type v_ = s + sinA_triCosc;
          const scalar_type num = (v_ * t - u_ * s) * cosA - v_;
          const scalar_type denum = (v_ * s + u_ * t) * sinA;
 
+#define ACCURATE 1 // NOTE(review): no matching #undef is visible here, so this looks like it leaks to every includer - TODO confirm and scope it
 #if ACCURATE
          // sqrt(1 - cosBp^2) loses precision when cosBp ~ 1 (small u.x).
          // Use stable factorization: sinBp = sqrt((denum-num)(denum+num)) / |denum|
-         // where denum-num = sinA*(1+triCosC)*(1-cosA_hat).
+         // where denum-num = sinA*(1+triCosc)*(1-cosA_hat).
 
          // For large triangles with high u.x, cosA_hat can approach -1,
          // making (1 + cosA_hat) near zero and the division unstable.
          // Use the algebraic identity only when cosA_hat > 0 (safe denominator).
-         const scalar_type rcpDenum = scalar_type(1) / denum;
-         const scalar_type oneMinusCosAhat = hlsl::select(cosA_hat > scalar_type(0), sinA_hat * sinA_hat / (scalar_type(1) + cosA_hat), scalar_type(1) - cosA_hat);
-         const scalar_type DminusN = sinA * (scalar_type(1) + triCosC) * oneMinusCosAhat;
-         sinBp = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), DminusN * (denum + num))) * nbl::hlsl::abs(rcpDenum);
-         cosBp = scalar_type(1) - DminusN * rcpDenum;
+         const scalar_type rcpDenum = _static_cast<scalar_type>(1) / denum;
+         const scalar_type oneMinusCosAhat = hlsl::select(cosA_hat > _static_cast<scalar_type>(0), sinA_hat * sinA_hat / (_static_cast<scalar_type>(1) + cosA_hat), _static_cast<scalar_type>(1) - cosA_hat);
+         const scalar_type DminusN = sinA * (_static_cast<scalar_type>(1) + triCosc) * oneMinusCosAhat;
+         sinBp = sqrt<scalar_type>(max<scalar_type>(_static_cast<scalar_type>(0), DminusN * (denum + num))) * nbl::hlsl::abs(rcpDenum);
+         cosBp = _static_cast<scalar_type>(1) - DminusN * rcpDenum;
 #else // 17% faster, less accurate
          cosBp = num / denum;
-         sinBp = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), scalar_type(1) - cosBp * cosBp));
+         sinBp = sqrt<scalar_type>(max<scalar_type>(_static_cast<scalar_type>(0), _static_cast<scalar_type>(1) - cosBp * cosBp));
 #endif
       }
       else // STA_PBRT, accurate, slowest
       {
          // PBRT uses cosPhi = -t, sinPhi = -s (pi offset from Arvo's A_hat)
          const scalar_type k1 = -t + cosA;
-         const scalar_type k2 = -s - sinA * triCosC;
+         const scalar_type k2 = -s - sinA * triCosc;
          cosBp = (k2 + impl::differenceOfProducts(k2, -t, k1, -s) * cosA) / (impl::sumOfProducts(k2, -s, k1, -t) * sinA);
-         cosBp = nbl::hlsl::clamp(cosBp, scalar_type(-1), scalar_type(1));
-         sinBp = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), scalar_type(1) - cosBp * cosBp));
+         cosBp = nbl::hlsl::clamp(cosBp, _static_cast<scalar_type>(-1), _static_cast<scalar_type>(1));
+         sinBp = sqrt<scalar_type>(_static_cast<scalar_type>(1) - cosBp * cosBp);
       }
 
       // Step 3: construct C' on the great circle through A toward C
@@ -143,81 +142,24 @@ struct SphericalTriangle<T, false, Algorithm>
 
       // Step 4: uniformly sample the great circle arc from B to C'
       scalar_type cosCpB;
-      if (Algorithm == STA_ARVO)
-         cosCpB = cosBp * triCosC + sinBp * eCdotB;
+      NBL_IF_CONSTEXPR(Algorithm == STA_ARVO)
+         cosCpB = cosBp * triCosc + sinBp * eCdotB;
       else
          cosCpB = nbl::hlsl::dot(cp, tri_vertices[1]);
-      const scalar_type z = scalar_type(1) - u.y * (scalar_type(1) - cosCpB);
-      const scalar_type sinZ = sqrt<scalar_type>(max<scalar_type>(scalar_type(0), scalar_type(1) - z * z));
+      // TODO: degeneracy at u.y = 0. z = 1 - u.y*(1-cosCpB) makes sinZ = sqrt(1-z^2) behave like
+      // sqrt(u.y) near zero, so dL/du.y diverges as u.y^(-1/2) and every higher derivative diverges
+      // faster. The forward Jacobian test in 37_HLSLSamplingTests reports ~2-8% error at u.y < 0.003
+      // even with the O(h^2) one-sided stencil because the third-derivative term dominates. At
+      // u.y = 0 exactly, L collapses to vertex B for all u.x (|det J| = 0), so it's an intrinsic
+      // property of the Arvo parameterization, not a bug. Fix: rework the arc interpolation to use
+      // a u.y -> angle mapping whose derivatives stay bounded near u.y = 0 (e.g. acos(z) = angle
+      // from B, then sample arc-length linearly), so the Jacobian is smooth and the skip band in
+      // the tester can be removed.
+      const scalar_type z = _static_cast<scalar_type>(1) - u.y * (_static_cast<scalar_type>(1) - cosCpB);
+      const scalar_type sinZ = sqrt<scalar_type>(max<scalar_type>(_static_cast<scalar_type>(0), _static_cast<scalar_type>(1) - z * z));
       return z * tri_vertices[1] + sinZ * hlsl::normalize(cp - cosCpB * tri_vertices[1]);
    }
 
-   density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-   {
-      return rcpSolidAngle;
-   }
-
-   weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC
-   {
-      return forwardPdf(u, cache);
-   }
-
-   density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC
-   {
-      return rcpSolidAngle;
-   }
-
-   weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC
-   {
-      return backwardPdf(L);
-   }
-
-   scalar_type rcpSolidAngle;
-   scalar_type cosA;
-   scalar_type sinA;
-   scalar_type sinA_triCosC; // precomputed sinA * triCosC
-   scalar_type eCdotB; // precomputed dot(e_C, tri_vertices[1]), Arvo only
-
-   vector3_type tri_vertices[2]; // A and B only
-   scalar_type triCosC;
-   vector3_type e_C; // precomputed cross(arcACPlaneNormal, A), unit vector perp to A in A-C plane
-};
-
-// Bijective: adds generateInverse, stores extra members for the inverse mapping
-template<typename T, SphericalTriangleAlgorithm Algorithm>
-struct SphericalTriangle<T, true, Algorithm>
-{
-   using scalar_type = T;
-   using vector2_type = vector<T, 2>;
-   using vector3_type = vector<T, 3>;
-
-   using base_type = SphericalTriangle<T, false, Algorithm>;
-   using domain_type = vector2_type;
-   using codomain_type = vector3_type;
-   using density_type = scalar_type;
-   using weight_type = density_type;
-
-   using cache_type = typename base_type::cache_type;
-
-   static SphericalTriangle create(NBL_CONST_REF_ARG(shapes::SphericalTriangle<T>) tri)
-   {
-      SphericalTriangle retval;
-      retval.base = base_type::create(tri);
-      retval.rcpSolidAngle = retval.base.rcpSolidAngle;
-      retval.vertexC = tri.vertices[2];
-      // precompute great circle normal of arc AC (needed for generateInverse)
-      const scalar_type cscB = tri.csc_sides[1];
-      retval.arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscB < numeric_limits<scalar_type>::max, cscB, scalar_type(0));
-      retval.triCscC = tri.csc_sides[2];
-      return retval;
-   }
-
-   codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC
-   {
-      return base.generate(u, cache);
-   }
-
-
    // generate() works in two steps:
    //   u.x -> pick C' on arc AC (choosing a sub-area fraction)
    //   u.y -> pick L on arc B->C' (linear interpolation)
@@ -229,37 +171,34 @@ struct SphericalTriangle<T, true, Algorithm>
    domain_type generateInverse(const codomain_type L) NBL_CONST_MEMBER_FUNC
    {
       // Step 1: find C' = intersection of great circles (B,L) and (A,C)
-      const vector3_type BxL = nbl::hlsl::cross(base.tri_vertices[1], L);
+      const vector3_type BxL = nbl::hlsl::cross(tri_vertices[1], L);
       const scalar_type sinBL_sq = nbl::hlsl::dot(BxL, BxL);
-      if (sinBL_sq < numeric_limits<scalar_type>::epsilon)
+
+      // C' lies on arc AC, so C' = A*cos(t) + e_C*sin(t).
+      // C' also lies on the B-L plane, so dot(BxL, C') = 0.
+      // Solving: (cos(t), sin(t)) = (tripleE, -tripleA) / R
+      const scalar_type tripleA = nbl::hlsl::dot(BxL, tri_vertices[0]);
+      const scalar_type tripleE = nbl::hlsl::dot(BxL, e_C);
+      const scalar_type R_sq = tripleA * tripleA + tripleE * tripleE;
+
+      if (sinBL_sq < numeric_limits<scalar_type>::epsilon || R_sq < numeric_limits<scalar_type>::epsilon)
       {
-         // L ~ B: u.y ~ 0, u.x is indeterminate (all u.x map to B when u.y=0).
          // Recover u.y from |L-B|^2 / |A-B|^2 (using C'=A; the (1-cosCpB) ratio
          // cancels so any C' gives the same result).
-         const vector3_type LminusB = L - base.tri_vertices[1];
-         const vector3_type AminusB = base.tri_vertices[0] - base.tri_vertices[1];
+         const vector3_type LminusB = L - tri_vertices[1];
+         const vector3_type AminusB = tri_vertices[0] - tri_vertices[1];
          const scalar_type v_num = nbl::hlsl::dot(LminusB, LminusB);
          const scalar_type v_denom = nbl::hlsl::dot(AminusB, AminusB);
          const scalar_type v = hlsl::select(v_denom > numeric_limits<scalar_type>::epsilon,
-            nbl::hlsl::clamp(v_num / v_denom, scalar_type(0.0), scalar_type(1.0)),
-            scalar_type(0.0));
-         return vector2_type(scalar_type(0.0), v);
+            nbl::hlsl::clamp(v_num / v_denom, _static_cast<scalar_type>(0.0), _static_cast<scalar_type>(1.0)),
+            _static_cast<scalar_type>(0.0));
+         return vector2_type(_static_cast<scalar_type>(0.0), v);
       }
 
-      // C' lies on arc AC, so C' = A*cos(t) + e_C*sin(t).
-      // C' also lies on the B-L plane, so dot(BxL, C') = 0.
-      // Solving: (cos(t), sin(t)) = (tripleE, -tripleA) / R
-      const scalar_type tripleA = nbl::hlsl::dot(BxL, base.tri_vertices[0]);
-      const scalar_type tripleE = nbl::hlsl::dot(BxL, base.e_C);
-      const scalar_type R_sq = tripleA * tripleA + tripleE * tripleE;
-      if (R_sq < numeric_limits<scalar_type>::epsilon)
-         return vector2_type(scalar_type(0.0), scalar_type(0.0));
-
-      const scalar_type rcpR = scalar_type(1.0) / nbl::hlsl::sqrt(R_sq);
-      vector3_type cp = base.tri_vertices[0] * (tripleE * rcpR) + base.e_C * (-tripleA * rcpR);
-      // two intersections exist; pick the one on the minor arc A->C
-      if (nbl::hlsl::dot(cp, base.tri_vertices[0] + vertexC) < scalar_type(0.0))
-         cp = -cp;
+      const scalar_type rcpR = _static_cast<scalar_type>(1.0) / nbl::hlsl::sqrt(R_sq);
+      vector3_type cp = tri_vertices[0] * (tripleE * rcpR) + e_C * (-tripleA * rcpR);
+      // two intersections exist; pick the one on the minor arc A->C (branchless sign flip)
+      cp = ieee754::flipSignIfRHSNegative(cp, nbl::hlsl::dot(cp, APlusC));
 
       // Step 2: u.x = solidAngle(A,B,C') / solidAngle(A,B,C)
       // Van Oosterom-Strackee: tan(Omega/2) = |A.(BxC')| / (1 + A.B + B.C' + C'.A)
@@ -270,25 +209,25 @@ struct SphericalTriangle<T, true, Algorithm>
       // Expanding C' = cosBp*A + sinBp*e_C into the triple product:
       //   A.(BxC') = cosBp * A.(BxA) + sinBp * A.(Bxe_C) = sinBp * A.(Bxe_C)
       // since A.(BxA) = 0 identically. This avoids the cancellation.
-      const scalar_type cosBp_inv = nbl::hlsl::dot(cp, base.tri_vertices[0]);
-      const scalar_type sinBp_inv = nbl::hlsl::dot(cp, base.e_C);
-      const scalar_type AxBdotE = nbl::hlsl::dot(base.tri_vertices[0], nbl::hlsl::cross(base.tri_vertices[1], base.e_C));
+      const scalar_type cosBp_inv = nbl::hlsl::dot(cp, tri_vertices[0]);
+      const scalar_type sinBp_inv = nbl::hlsl::dot(cp, e_C);
+      const scalar_type AxBdotE = nbl::hlsl::dot(tri_vertices[0], nbl::hlsl::cross(tri_vertices[1], e_C));
       const scalar_type num = sinBp_inv * AxBdotE;
-      const scalar_type cosCpB = nbl::hlsl::dot(base.tri_vertices[1], cp);
-      const scalar_type den = scalar_type(1.0) + base.triCosC + cosCpB + cosBp_inv;
-      const scalar_type subSolidAngle = scalar_type(2.0) * nbl::hlsl::atan2(nbl::hlsl::abs(num), den);
-      const scalar_type u = nbl::hlsl::clamp(subSolidAngle * rcpSolidAngle, scalar_type(0.0), scalar_type(1.0));
+      const scalar_type cosCpB = nbl::hlsl::dot(tri_vertices[1], cp);
+      const scalar_type den = _static_cast<scalar_type>(1.0) + triCosc + cosCpB + cosBp_inv;
+      const scalar_type subSolidAngle = _static_cast<scalar_type>(2.0) * nbl::hlsl::atan2(nbl::hlsl::abs(num), den);
+      const scalar_type u = nbl::hlsl::clamp(subSolidAngle * rcpSolidAngle, _static_cast<scalar_type>(0.0), _static_cast<scalar_type>(1.0));
 
       // Step 3: u.y = |L-B|^2 / |C'-B|^2
       // Squared Euclidean distance avoids catastrophic cancellation vs (1-dot)/(1-dot)
-      const vector3_type LminusB = L - base.tri_vertices[1];
-      const vector3_type cpMinusB = cp - base.tri_vertices[1];
+      const vector3_type LminusB = L - tri_vertices[1];
+      const vector3_type cpMinusB = cp - tri_vertices[1];
       const scalar_type v_num = nbl::hlsl::dot(LminusB, LminusB);
       const scalar_type v_denom = nbl::hlsl::dot(cpMinusB, cpMinusB);
       const scalar_type v = hlsl::select(v_denom > numeric_limits<scalar_type>::epsilon,
          nbl::hlsl::clamp(v_num / nbl::hlsl::max(v_denom, numeric_limits<scalar_type>::min),
-            scalar_type(0.0), scalar_type(1.0)),
-         scalar_type(0.0));
+            _static_cast<scalar_type>(0.0), _static_cast<scalar_type>(1.0)),
+         _static_cast<scalar_type>(0.0));
 
       return vector2_type(u, v);
    }
@@ -313,13 +252,16 @@ struct SphericalTriangle<T, true, Algorithm>
       return backwardPdf(L);
    }
 
-   // mirrored from base for uniform access across both specializations
    scalar_type rcpSolidAngle;
+   scalar_type cosA;
+   scalar_type sinA;
+   scalar_type sinA_triCosc; // precomputed sinA * triCosc
+   scalar_type eCdotB; // precomputed dot(e_C, tri_vertices[1]), Arvo only
 
-   base_type base;
-   vector3_type vertexC;
-   vector3_type arcACPlaneNormal; // precomputed normalize(cross(A, C)), great circle normal of arc AC
-   scalar_type triCscC;
+   vector3_type tri_vertices[2]; // A and B only
+   scalar_type triCosc;
+   vector3_type e_C; // precomputed cross(arcACPlaneNormal, A), unit vector perp to A in A-C plane
+   vector3_type APlusC; // precomputed A + C, used to pick the minor-arc intersection in generateInverse
 };
 
 } // namespace sampling
diff --git a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
index 21901a9628..0d22323921 100644
--- a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
+++ b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl
@@ -35,10 +35,11 @@ struct UniformHemisphere
 
 	static codomain_type generate(const domain_type u)
 	{
-		typename ConcentricMapping<T>::cache_type cmCache;
+		typename ConcentricMapping<scalar_type>::cache_type cmCache;
 		const vector_t2 p = ConcentricMapping<T>::generate(u, cmCache);
-		const T z = T(1.0) - cmCache.r2;
-		const T xyScale = hlsl::sqrt<T>(hlsl::max<T>(T(0.0), T(2.0) - cmCache.r2));
+		assert(cmCache.r2 <= _static_cast<scalar_type>(1.0));
+		const T z = _static_cast<T>(1.0) - cmCache.r2;
+		const T xyScale = hlsl::sqrt<scalar_type>(_static_cast<scalar_type>(2.0) - cmCache.r2);
 		return vector_t3(p.x * xyScale, p.y * xyScale, z);
 	}
 
@@ -50,30 +51,30 @@ struct UniformHemisphere
 	static domain_type generateInverse(const codomain_type v)
 	{
 		// r_disk / r_xy = sqrt(1-z) / sqrt(1-z^2) = 1/sqrt(1+z)
-		const T scale = T(1.0) / hlsl::sqrt<T>(T(1.0) + v.z);
+		const scalar_type scale = hlsl::rsqrt<T>(_static_cast<T>(1.0) + v.z);
 		return ConcentricMapping<T>::generateInverse(vector_t2(v.x * scale, v.y * scale));
 	}
 
 	static density_type forwardPdf(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * numbers::inv_pi<T>;
+		return _static_cast<scalar_type>(0.5) * numbers::inv_pi<T>;
 	}
 
 	static weight_type forwardWeight(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.5) * numbers::inv_pi<T>;
 	}
 
 	static density_type backwardPdf(const codomain_type v)
 	{
 		assert(v.z > 0);
-		return T(0.5) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.5) * numbers::inv_pi<T>;
 	}
 
 	static weight_type backwardWeight(const codomain_type v)
 	{
 		assert(v.z > 0);
-		return T(0.5) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.5) * numbers::inv_pi<T>;
 	}
 
 };
@@ -96,7 +97,7 @@ struct UniformSphere
 
 	static codomain_type generate(const domain_type u)
 	{
-		const T tmp = u.x * T(2.0) - T(1.0);
+		const T tmp = u.x * _static_cast<T>(2.0) - _static_cast<T>(1.0);
 		const codomain_type L = hemisphere_t::generate(vector_t2(hlsl::abs<T>(tmp), u.y));
 		return vector_t3(L.x, L.y, L.z * hlsl::sign(tmp));
 	}
@@ -108,29 +109,29 @@ struct UniformSphere
 
 	static domain_type generateInverse(const codomain_type v)
 	{
-		const T dir = hlsl::sign(v.z) * T(0.5);
+		const T dir = hlsl::sign(v.z) * _static_cast<T>(0.5);
 		const domain_type hemiU = hemisphere_t::generateInverse(vector_t3(v.x, v.y, hlsl::abs<T>(v.z)));
-		return vector_t2(hemiU.x * dir + T(0.5), hemiU.y);
+		return vector_t2(hemiU.x * dir + _static_cast<T>(0.5), hemiU.y);
 	}
 
 	static density_type forwardPdf(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * hemisphere_t::forwardPdf(u, cache);
+		return _static_cast<T>(0.5) * hemisphere_t::forwardPdf(u, cache);
 	}
 
 	static weight_type forwardWeight(const domain_type u, const cache_type cache)
 	{
-		return T(0.5) * hemisphere_t::forwardWeight(u, cache);
+		return _static_cast<T>(0.5) * hemisphere_t::forwardWeight(u, cache);
 	}
 
 	static density_type backwardPdf(const codomain_type v)
 	{
-		return T(0.25) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.25) * numbers::inv_pi<T>;
 	}
 
 	static weight_type backwardWeight(const codomain_type v)
 	{
-		return T(0.25) * numbers::inv_pi<T>;
+		return _static_cast<T>(0.25) * numbers::inv_pi<T>;
 	}
 
 };
diff --git a/include/nbl/builtin/hlsl/shapes/obb.hlsl b/include/nbl/builtin/hlsl/shapes/obb.hlsl
index bdddc48ebf..763d434518 100644
--- a/include/nbl/builtin/hlsl/shapes/obb.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/obb.hlsl
@@ -26,17 +26,17 @@ struct OBB
 			axesScale[dim_i] = axes[dim_i] * obbScale[dim_i];
 		}
 		OBB ret;
-		for (int row_i = 0; row_i < D; row_i++)
+		for (int16_t row_i = 0; row_i < D; row_i++)
 		{
-		  for (int col_i = 0; col_i < D; col_i++)
+		  for (int16_t col_i = 0; col_i < D; col_i++)
 		  {
 				ret.transform[row_i][col_i] = axesScale[col_i][row_i];
 		  }
 		}
-		for (int dim_i = 0; dim_i < D; dim_i++)
+		for (int16_t dim_i = 0; dim_i < D; dim_i++)
 		{
 			scalar_t sum = 0; 
-			for (int dim_j = 0; dim_j < D; dim_j++)
+			for (int16_t dim_j = 0; dim_j < D; dim_j++)
 			{
 				sum += axesScale[dim_j][dim_i];
 			}
@@ -49,7 +49,7 @@ struct OBB
 	NBL_CONSTEXPR_STATIC_INLINE OBB createAxisAligned(point_t mid, point_t len)
 	{
 		point_t axes[D];
-		for (auto dim_i = 0; dim_i < D; dim_i++)
+		for (int16_t dim_i = 0; dim_i < D; dim_i++)
 		{
 			axes[dim_i] = point_t(0);
 			axes[dim_i][dim_i] = 1;
@@ -60,6 +60,106 @@ struct OBB
 	matrix<scalar_t, D, D + 1> transform;
 };
 
+// Decomposed OBB view: caches columns and minCorner for fast vertex queries.
+// Same 12-float footprint as float32_t3x4, but laid out for branchless
+// corner computation.
+// Layout: columns[i] are the full OBB edge vectors (not half-extents), and
+// minCorner is the world position of corner 0b000, i.e.
+// center - 0.5*(col0+col1+col2).
+template<typename Scalar=float32_t>
+struct OBBView
+{
+	using scalar_t = Scalar;
+	using vec3_t = vector<Scalar, 3>;
+	using mat3_t = matrix<Scalar, 3, 3>;
+
+	mat3_t columns;
+	vec3_t minCorner;
+
+	static OBBView create(matrix<scalar_t, 3, 4> modelMatrix)
+	{
+		matrix<scalar_t, 4, 3> m = transpose(modelMatrix);
+		OBBView v;
+		v.columns = mat3_t(m[0].xyz, m[1].xyz, m[2].xyz);
+		v.minCorner = m[3].xyz - scalar_t(0.5) * (m[0].xyz + m[1].xyz + m[2].xyz);
+		return v;
+	}
+
+	vec3_t getCenter() NBL_CONST_MEMBER_FUNC
+	{
+		return minCorner + scalar_t(0.5) * (columns[0] + columns[1] + columns[2]);
+	}
+
+	vec3_t getVertex(uint32_t i) NBL_CONST_MEMBER_FUNC
+	{
+		vec3_t p = minCorner;
+		if (i & 1u) p += columns[0];
+		if (i & 2u) p += columns[1];
+		if (i & 4u) p += columns[2];
+		return p;
+	}
+
+	// Scalar-z specialization: only computes the z component of a corner.
+	// Used for clip-mask build where we only need the sign of z.
+	scalar_t getVertexZ(uint32_t i) NBL_CONST_MEMBER_FUNC
+	{
+		scalar_t pz = minCorner.z;
+		if (i & 1u) pz += columns[0].z;
+		if (i & 2u) pz += columns[1].z;
+		if (i & 4u) pz += columns[2].z;
+		return pz;
+	}
+
+	// Ray-OBB intersection via slab test in OBB-local space.
+	// Returns (tMin, tMax, hit). tMin is clamped to 0 if ray starts inside.
+	// TODO: not optimized -- precompute inverse columns, handle f~=0 edge case
+	struct Intersection
+	{
+		scalar_t tMin;
+		scalar_t tMax;
+		bool hit;
+	};
+
+	Intersection rayIntersection(vec3_t rayOrigin, vec3_t rayDir) NBL_CONST_MEMBER_FUNC
+	{
+		// Vector from ray origin to minCorner
+		vec3_t delta = rayOrigin - minCorner;
+
+		scalar_t tMin = scalar_t(-1e30);
+		scalar_t tMax = scalar_t(1e30);
+
+		// Slab test against each OBB axis
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			vec3_t axis = columns[i];
+			scalar_t axisLenSq = dot(axis, axis);
+			scalar_t e = dot(axis, delta);
+			scalar_t f = dot(axis, rayDir);
+
+			// Project ray onto axis: entry at t where dot = 0, exit where dot = axisLenSq
+			scalar_t invF = scalar_t(1.0) / f;
+			scalar_t t1 = (-e) * invF;
+			scalar_t t2 = (axisLenSq - e) * invF;
+
+			if (t1 > t2)
+			{
+				scalar_t tmp = t1;
+				t1 = t2;
+				t2 = tmp;
+			}
+
+			tMin = max(tMin, t1);
+			tMax = min(tMax, t2);
+		}
+
+		Intersection result;
+		result.hit = (tMax >= tMin) && (tMax > scalar_t(0));
+		result.tMin = max(tMin, scalar_t(0));
+		result.tMax = tMax;
+		return result;
+	}
+};
+
 }
 }
 }
diff --git a/include/nbl/builtin/hlsl/shapes/obb_silhouette.hlsl b/include/nbl/builtin/hlsl/shapes/obb_silhouette.hlsl
new file mode 100644
index 0000000000..83c668033c
--- /dev/null
+++ b/include/nbl/builtin/hlsl/shapes/obb_silhouette.hlsl
@@ -0,0 +1,453 @@
+// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_SHAPES_OBB_SILHOUETTE_HLSL_INCLUDED_
+#define _NBL_BUILTIN_HLSL_SHAPES_OBB_SILHOUETTE_HLSL_INCLUDED_
+
+#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+#include <nbl/builtin/hlsl/cpp_compat/intrinsics.hlsl>
+#include <nbl/builtin/hlsl/bit.hlsl>
+#include <nbl/builtin/hlsl/shapes/obb.hlsl>
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace shapes
+{
+
+// Max vertices in an OBB silhouette after horizon clipping: a 6-vertex
+// silhouette can gain at most one extra vertex from the partial clip endpoints.
+NBL_CONSTEXPR uint32_t MaxOBBSilhouetteVertices = 7;
+
+// ============================================================================
+// 27-config silhouette table for an axis-aligned cube viewed from any of the
+// 27 region cells (3 per axis). Each entry: {count, v0, v1, v2, v3, v4, v5}
+// with vertices in CCW order relative to the viewer.
+// ============================================================================
+// Human-readable LUT kept for reference / debugging. The hot path uses the
+// packed binSilhouettes form below; every row here must decode to the same count and corners.
+static const uint32_t silhouettes[27][7] = {
+	{6, 1, 3, 2, 6, 4, 5}, // 0: Black
+	{6, 2, 6, 4, 5, 7, 3}, // 1: White
+	{6, 0, 4, 5, 7, 3, 2}, // 2: Gray
+	{6, 1, 3, 7, 6, 4, 5}, // 3: Red
+	{4, 4, 5, 7, 6, 0, 0}, // 4: Green
+	{6, 0, 4, 5, 7, 6, 2}, // 5: Blue
+	{6, 0, 1, 3, 7, 6, 4}, // 6: Yellow
+	{6, 0, 1, 5, 7, 6, 4}, // 7: Magenta
+	{6, 0, 1, 5, 7, 6, 2}, // 8: Cyan
+	{6, 1, 3, 2, 6, 7, 5}, // 9: Orange
+	{4, 2, 6, 7, 3, 0, 0}, // 10: Light Orange
+	{6, 0, 4, 6, 7, 3, 2}, // 11: Dark Orange
+	{4, 1, 3, 7, 5, 0, 0}, // 12: Pink
+	{6, 0, 4, 6, 7, 3, 2}, // 13: Light Pink (count is 6, as packed in binSilhouettes[13])
+	{4, 0, 4, 6, 2, 0, 0}, // 14: Deep Rose
+	{6, 0, 1, 3, 7, 5, 4}, // 15: Purple
+	{4, 0, 1, 5, 4, 0, 0}, // 16: Light Purple
+	{6, 0, 1, 5, 4, 6, 2}, // 17: Indigo
+	{6, 0, 2, 6, 7, 5, 1}, // 18: Dark Green
+	{6, 0, 2, 6, 7, 3, 1}, // 19: Lime
+	{6, 0, 4, 6, 7, 3, 1}, // 20: Forest Green
+	{6, 0, 2, 3, 7, 5, 1}, // 21: Navy
+	{4, 0, 2, 3, 1, 0, 0}, // 22: Sky Blue
+	{6, 0, 4, 6, 2, 3, 1}, // 23: Teal
+	{6, 0, 2, 3, 7, 5, 4}, // 24: Brown
+	{6, 0, 2, 3, 1, 5, 4}, // 25: Tan/Beige
+	{6, 1, 5, 4, 6, 2, 3}, // 26: Dark Brown
+};
+
+// Binary packed silhouettes: bits 0-17 hold 6 corner indices (3 bits each),
+// bits 29-31 hold the vertex count. Hot-path uses this; LUT above is reference.
+static const uint32_t binSilhouettes[27] = {
+	0b11000000000000101100110010011001,
+	0b11000000000000011111101100110010,
+	0b11000000000000010011111101100000,
+	0b11000000000000101100110111011001,
+	0b10000000000000000000110111101100,
+	0b11000000000000010110111101100000,
+	0b11000000000000100110111011001000,
+	0b11000000000000100110111101001000,
+	0b11000000000000010110111101001000,
+	0b11000000000000101111110010011001,
+	0b10000000000000000000011111110010,
+	0b11000000000000010011111110100000,
+	0b10000000000000000000101111011001,
+	0b11000000000000010011111110100000,
+	0b10000000000000000000010110100000,
+	0b11000000000000100101111011001000,
+	0b10000000000000000000100101001000,
+	0b11000000000000010110100101001000,
+	0b11000000000000001101111110010000,
+	0b11000000000000001011111110010000,
+	0b11000000000000001011111110100000,
+	0b11000000000000001101111011010000,
+	0b10000000000000000000001011010000,
+	0b11000000000000001011010110100000,
+	0b11000000000000100101111011010000,
+	0b11000000000000100101001011010000,
+	0b11000000000000011010110100101001,
+};
+
+struct BinSilhouette
+{
+	static BinSilhouette create(uint32_t configIndex)
+	{
+		BinSilhouette s;
+		s.data = binSilhouettes[configIndex];
+		return s;
+	}
+
+	uint32_t getVertexIndex(uint32_t index) NBL_CONST_MEMBER_FUNC
+	{
+		return (data >> (3u * index)) & 0x7u;
+	}
+
+	uint32_t getVertexCount() NBL_CONST_MEMBER_FUNC
+	{
+		return (data >> 29u) & 0x7u;
+	}
+
+	void rotr(uint32_t shift, uint32_t size)
+	{
+		data = nbl::hlsl::rotr(data, shift, size);
+	}
+
+	void rotl(uint32_t shift, uint32_t size)
+	{
+		data = nbl::hlsl::rotl(data, shift, size);
+	}
+
+	uint32_t data;
+};
+
+// Metadata-only descriptor of a clipped OBB silhouette: three uint32_t (12 bytes)
+// plus the float32_t3 shadingPoint. Vertex positions are NOT stored, consumers call
+// materialize(view, verts) to fill a local array on demand, keeping vec3 storage out of struct-passing.
+//
+// silData: bits 0-17 rotated 3-bit corner indices (positive-z corners first
+// in CCW order, then negative-z), bits 24-28 configIndex, bits 29-31 silhouette size.
+// positiveCount: positive-z corners surviving the clip.
+// count: emitted vertex count (positiveCount + 2 on partial clip, 0 if fully clipped).
+struct ClippedSilhouette
+{
+	uint32_t   silData;       // rotated BinSilhouette data + size
+	uint32_t   positiveCount; // # of positive-z OBB corners after rotation
+	uint32_t   count;         // total emitted vertex count consumers cascade on
+	float32_t3 shadingPoint;  // observer position; baked into clipping + materialize
+
+	static ClippedSilhouette create(shapes::OBBView<float32_t> view, float32_t3 shadingPoint)
+	{
+		uint32_t3 region;
+		uint32_t  configIndex, vertexCount;
+		// OBB-local observer coord along axis i is dot(col_i, shadingPoint - minCorner);
+		// compare against [0, |col_i|^2] for branchless 27-config classify.
+		const float32_t3 toMin   = view.minCorner - shadingPoint;
+		float32_t3 sqScales = float32_t3(dot(view.columns[0], view.columns[0]), dot(view.columns[1], view.columns[1]), dot(view.columns[2], view.columns[2]));
+		float32_t3 proj     = -float32_t3(dot(view.columns[0], toMin), dot(view.columns[1], toMin), dot(view.columns[2], toMin));
+
+		uint32_t3 below = uint32_t3(proj < float32_t3(0, 0, 0));
+		uint32_t3 above = uint32_t3(proj > sqScales);
+		region          = uint32_t3(uint32_t3(1u, 1u, 1u) + below - above);
+
+		configIndex = region.x + region.y * 3u + region.z * 9u;
+
+		BinSilhouette sil = BinSilhouette::create(configIndex);
+		vertexCount       = sil.getVertexCount();
+
+		// Always evaluate all 6 slots so the loop unrolls without a runtime
+		// branch on vertexCount; high bits are masked off below.
+		uint32_t validMask = (1u << vertexCount) - 1u;
+		uint32_t clipMask  = 0u;
+		NBL_UNROLL
+		for (uint32_t i = 0; i < 6; i++)
+			clipMask |= (hlsl::select(view.getVertexZ(sil.getVertexIndex(i)) < shadingPoint.z, 1u, 0u)) << i;
+		clipMask &= validMask;
+
+		uint32_t clipCount = countbits(clipMask);
+		uint32_t invertedMask = ~clipMask & validMask;
+
+		// clipMask is masked to validMask, so the shift can't pull garbage into bit 0.
+		bool wrapAround = (clipMask & (clipMask >> (vertexCount - 1))) != 0u;
+
+		uint32_t rotateAmount = nbl::hlsl::select(wrapAround, firstbitlow(invertedMask), // first positive
+			firstbithigh(clipMask) + 1); // first vertex after last negative
+
+		sil.rotr(rotateAmount * 3, vertexCount * 3);
+
+		ClippedSilhouette self;
+		// rotr wipes bits above width, so re-inject vertexCount and pack configIndex.
+		self.silData       = sil.data | (configIndex << 24u) | (vertexCount << 29u);
+		self.positiveCount = vertexCount - clipCount;
+		const bool fullyClipped = (clipCount == vertexCount);
+		const bool partialClip  = (clipCount > 0) && !fullyClipped;
+		self.count              = nbl::hlsl::select(fullyClipped, 0u, self.positiveCount + (partialClip ? 2u : 0u));
+		self.shadingPoint       = shadingPoint;
+
+		return self;
+	}
+
+	uint32_t cornerIndex(uint32_t k) NBL_CONST_MEMBER_FUNC
+	{
+		return (silData >> (3u * k)) & 0x7u;
+	}
+
+	uint32_t  getVertexCount() NBL_CONST_MEMBER_FUNC { return (silData >> 29u) & 0x7u; }
+	uint32_t  getConfigIndex() NBL_CONST_MEMBER_FUNC { return (silData >> 24u) & 0x1Fu; }
+	uint32_t3 getRegion() NBL_CONST_MEMBER_FUNC
+	{
+		const uint32_t ci = getConfigIndex();
+		return uint32_t3(ci % 3u, (ci / 3u) % 3u, ci / 9u);
+	}
+	BinSilhouette getOriginalBinSilhouette() NBL_CONST_MEMBER_FUNC { return BinSilhouette::create(getConfigIndex()); }
+
+	// Fill `count` vertices into the caller's local array. Each vertex is
+	// view.getVertex(cornerIndex(K)), columns[0/1/2] indexed by literal so
+	// SROA keeps them in registers and the 3 conditional adds run in parallel.
+	// Cascade on count rather than for+break so every vertices[K] write uses
+	// a literal slot index, otherwise the array demotes to Function memory.
+	// Vertices are returned in shading-point-relative coordinates (i.e.
+	// view.getVertex(...) - shadingPoint), so direction-from-shading-point
+	// reductions in consumers (cross/dot, gnomonic projection, horizon clip
+	// to z=0) are correct.
+	void materialize(shapes::OBBView<float32_t> view, out float32_t3 vertices[MaxOBBSilhouetteVertices]) NBL_CONST_MEMBER_FUNC
+	{
+		// Zero the unused tail; some consumers (DCE sinks) read
+		// the full 7-wide array.
+		NBL_UNROLL
+		for (uint32_t init = 0; init < MaxOBBSilhouetteVertices; init++)
+			vertices[init] = float32_t3(0.0f, 0.0f, 0.0f);
+		if (count == 0)
+			return;
+
+		vertices[0] = view.getVertex(cornerIndex(0)) - shadingPoint;
+		if (positiveCount > 1)
+		{
+			vertices[1] = view.getVertex(cornerIndex(1)) - shadingPoint;
+			if (positiveCount > 2)
+			{
+				vertices[2] = view.getVertex(cornerIndex(2)) - shadingPoint;
+				if (positiveCount > 3)
+				{
+					vertices[3] = view.getVertex(cornerIndex(3)) - shadingPoint;
+					if (positiveCount > 4)
+					{
+						vertices[4] = view.getVertex(cornerIndex(4)) - shadingPoint;
+						if (positiveCount > 5)
+						{
+							vertices[5] = view.getVertex(cornerIndex(5)) - shadingPoint;
+							if (positiveCount > 6)
+								vertices[6] = view.getVertex(cornerIndex(6)) - shadingPoint;
+						}
+					}
+				}
+			}
+		}
+
+		// Partial-clip: two extra getVertex calls for the negative-z endpoints
+		// around the positive run, lerped to z=0 (in shading-point-relative
+		// frame). Cascaded for literal slot indices.
+		if (count > positiveCount)
+		{
+			const uint32_t   silSize   = (silData >> 29u) & 0x7u;
+			const float32_t3 vFirstNeg = view.getVertex(cornerIndex(positiveCount)) - shadingPoint;
+			const float32_t3 vLastNeg  = view.getVertex(cornerIndex(silSize - 1u)) - shadingPoint;
+			const float32_t3 vFirstPos = vertices[0];
+
+			if (positiveCount == 1)
+			{
+				const float32_t3 vLastPos = vertices[0];
+				const float32_t  tA       = vLastPos.z / (vLastPos.z - vFirstNeg.z);
+				vertices[1]               = lerp(vLastPos, vFirstNeg, tA);
+				const float32_t tB        = vLastNeg.z / (vLastNeg.z - vFirstPos.z);
+				vertices[2]               = lerp(vLastNeg, vFirstPos, tB);
+			}
+			else if (positiveCount == 2)
+			{
+				const float32_t3 vLastPos = vertices[1];
+				const float32_t  tA       = vLastPos.z / (vLastPos.z - vFirstNeg.z);
+				vertices[2]               = lerp(vLastPos, vFirstNeg, tA);
+				const float32_t tB        = vLastNeg.z / (vLastNeg.z - vFirstPos.z);
+				vertices[3]               = lerp(vLastNeg, vFirstPos, tB);
+			}
+			else if (positiveCount == 3)
+			{
+				const float32_t3 vLastPos = vertices[2];
+				const float32_t  tA       = vLastPos.z / (vLastPos.z - vFirstNeg.z);
+				vertices[3]               = lerp(vLastPos, vFirstNeg, tA);
+				const float32_t tB        = vLastNeg.z / (vLastNeg.z - vFirstPos.z);
+				vertices[4]               = lerp(vLastNeg, vFirstPos, tB);
+			}
+			else if (positiveCount == 4)
+			{
+				const float32_t3 vLastPos = vertices[3];
+				const float32_t  tA       = vLastPos.z / (vLastPos.z - vFirstNeg.z);
+				vertices[4]               = lerp(vLastPos, vFirstNeg, tA);
+				const float32_t tB        = vLastNeg.z / (vLastNeg.z - vFirstPos.z);
+				vertices[5]               = lerp(vLastNeg, vFirstPos, tB);
+			}
+			else // positiveCount == 5; positiveCount == 6 -> count == 8 > 7, impossible
+			{
+				const float32_t3 vLastPos = vertices[4];
+				const float32_t  tA       = vLastPos.z / (vLastPos.z - vFirstNeg.z);
+				vertices[5]               = lerp(vLastPos, vFirstNeg, tA);
+				const float32_t tB        = vLastNeg.z / (vLastNeg.z - vFirstPos.z);
+				vertices[6]               = lerp(vLastNeg, vFirstPos, tB);
+			}
+		}
+	}
+
+	// materialize + normalize of the `count` emitted vertices (unused tail stays zero). Cascaded for literal slot indices.
+	void materializeNormalized(shapes::OBBView<float32_t> view, out float32_t3 vertices[MaxOBBSilhouetteVertices]) NBL_CONST_MEMBER_FUNC
+	{
+		materialize(view, vertices);
+		if (count > 0) vertices[0] = nbl::hlsl::normalize(vertices[0]); // fully clipped: keep the zero fill, normalize(0) would be NaN
+		if (count > 1)
+		{
+			vertices[1] = nbl::hlsl::normalize(vertices[1]);
+			if (count > 2)
+			{
+				vertices[2] = nbl::hlsl::normalize(vertices[2]);
+				if (count > 3)
+				{
+					vertices[3] = nbl::hlsl::normalize(vertices[3]);
+					if (count > 4)
+					{
+						vertices[4] = nbl::hlsl::normalize(vertices[4]);
+						if (count > 5)
+						{
+							vertices[5] = nbl::hlsl::normalize(vertices[5]);
+							if (count > 6)
+								vertices[6] = nbl::hlsl::normalize(vertices[6]);
+						}
+					}
+				}
+			}
+		}
+	}
+};
+
+struct SilEdgeNormals
+{
+	// Sentinel for unused edge slots: dot(dir, (0,0,-1)) = -dir.z. Callers
+	// gate isInside on dir.z > 0, so this dot is always negative for them,
+	// its asuint has the sign bit set, which makes the bitwise-AND
+	// reduction in isInside() pass through the real sign bits unchanged.
+	static SilEdgeNormals initSentinel()
+	{
+		SilEdgeNormals result;
+		NBL_UNROLL
+		for (uint32_t i = 0; i < MaxOBBSilhouetteVertices; i++)
+			result.edgeNormals[i] = float32_t3(0.0f, 0.0f, -1.0f);
+		return result;
+	}
+
+	// Build per-edge cross products from a materialized vertex array.
+	static SilEdgeNormals create(float32_t3 vertices[MaxOBBSilhouetteVertices], uint32_t count)
+	{
+		SilEdgeNormals result = initSentinel();
+
+		float32_t3 v0 = vertices[0];
+		float32_t3 v1 = vertices[1];
+		float32_t3 v2 = vertices[2];
+
+		result.edgeNormals[0] = cross(v0, v1);
+		result.edgeNormals[1] = cross(v1, v2);
+
+		if (count > 3)
+		{
+			float32_t3 v3         = vertices[3];
+			result.edgeNormals[2] = cross(v2, v3);
+
+			if (count > 4)
+			{
+				float32_t3 v4         = vertices[4];
+				result.edgeNormals[3] = cross(v3, v4);
+
+				if (count > 5)
+				{
+					float32_t3 v5         = vertices[5];
+					result.edgeNormals[4] = cross(v4, v5);
+
+					if (count > 6)
+					{
+						float32_t3 v6         = vertices[6];
+						result.edgeNormals[5] = cross(v5, v6);
+						result.edgeNormals[6] = cross(v6, v0);
+					}
+					else
+					{
+						result.edgeNormals[5] = cross(v5, v0);
+					}
+				}
+				else
+				{
+					result.edgeNormals[4] = cross(v4, v0);
+				}
+			}
+			else
+			{
+				result.edgeNormals[3] = cross(v3, v0);
+			}
+		}
+		else
+		{
+			result.edgeNormals[2] = cross(v2, v0);
+		}
+
+		return result;
+	}
+
+	// Sign-bit AND reduction: dot <= 0 iff asuint(dot) sign bit set (modulo +0.0
+	// exact-boundary samples, which never hit in practice). 6 ANDs on the INT
+	// pipe instead of 6 fmaxes on the FP pipe; lets the FP pipe stay busy with
+	// the 7 dot products on Ampere's split FP/INT scheduler.
+	bool isInside(float32_t3 dir)
+	{
+		const float32_t d0 = hlsl::dot(dir, edgeNormals[0]);
+		const float32_t d1 = hlsl::dot(dir, edgeNormals[1]);
+		const float32_t d2 = hlsl::dot(dir, edgeNormals[2]);
+		const float32_t d3 = hlsl::dot(dir, edgeNormals[3]);
+		const float32_t d4 = hlsl::dot(dir, edgeNormals[4]);
+		const float32_t d5 = hlsl::dot(dir, edgeNormals[5]);
+		const float32_t d6 = hlsl::dot(dir, edgeNormals[6]);
+		const uint32_t allNeg = asuint(d0) & asuint(d1) & asuint(d2) & asuint(d3) & asuint(d4) & asuint(d5) & asuint(d6);
+		return (allNeg & 0x80000000u) != 0u;
+	}
+
+	// Transform edge normals from world-space to the pyramid's local frame in-place.
+	// After this, edgeNormals[i] = (dot(n, axis1), dot(n, axis2), dot(n, axis3))
+	// and isInsideLocal() can do 2-FMA half-plane tests without extra storage.
+	// NOTE: destroys world-space normals, isInside() will no longer work correctly.
+	void transformToLocal(float32_t3 axis1, float32_t3 axis2, float32_t3 axis3)
+	{
+		NBL_UNROLL
+		for (uint32_t i = 0; i < MaxOBBSilhouetteVertices; i++)
+		{
+			float32_t3 n   = edgeNormals[i];
+			edgeNormals[i] = float32_t3(dot(n, axis1), dot(n, axis2), dot(n, axis3));
+		}
+	}
+
+	// 2D gnomonic containment test after transformToLocal().
+	//   dot(dir_unnorm, n_local) = localX * n.x + localY * n.y + n.z
+	bool isInsideLocal(float32_t localX, float32_t localY)
+	{
+		float32_t maxDot = localX * edgeNormals[0].x + localY * edgeNormals[0].y + edgeNormals[0].z;
+		maxDot           = hlsl::max(maxDot, localX * edgeNormals[1].x + localY * edgeNormals[1].y + edgeNormals[1].z);
+		maxDot           = hlsl::max(maxDot, localX * edgeNormals[2].x + localY * edgeNormals[2].y + edgeNormals[2].z);
+		maxDot           = hlsl::max(maxDot, localX * edgeNormals[3].x + localY * edgeNormals[3].y + edgeNormals[3].z);
+		maxDot           = hlsl::max(maxDot, localX * edgeNormals[4].x + localY * edgeNormals[4].y + edgeNormals[4].z);
+		maxDot           = hlsl::max(maxDot, localX * edgeNormals[5].x + localY * edgeNormals[5].y + edgeNormals[5].z);
+		maxDot           = hlsl::max(maxDot, localX * edgeNormals[6].x + localY * edgeNormals[6].y + edgeNormals[6].z);
+		return maxDot <= 0.0f;
+	}
+
+	float32_t3 edgeNormals[MaxOBBSilhouetteVertices];
+};
+
+}
+}
+}
+
+#endif
diff --git a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
index 5895e4bc80..7ffe2ef407 100644
--- a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl
@@ -94,6 +94,8 @@ struct SphericalRectangle
         vector4_type cosGamma;
     };
 
+    // TODO: take an observer already, this way we can precompute and store the `r0`, `denorm_n_z` and `rcpLen_denorm_n_z`
+    // we need all of the above for solid angle and projected solid angle computation
     static SphericalRectangle<Scalar> create(NBL_CONST_REF_ARG(CompressedSphericalRectangle<Scalar>) compressed)
     {
         SphericalRectangle<scalar_type> retval;
@@ -102,7 +104,8 @@ struct SphericalRectangle
         retval.basis[0] = compressed.right / retval.extents[0];
         retval.basis[1] = compressed.up / retval.extents[1];
         assert(hlsl::abs(hlsl::dot(retval.basis[0], retval.basis[1])) < scalar_type(1e-5));
-        retval.basis[2] = hlsl::normalize(hlsl::cross(retval.basis[0], retval.basis[1]));
+        // don't normalize, the `right` and `up` vectors are orthogonal and the two bases are normalized already!
+        retval.basis[2] = hlsl::cross(retval.basis[0], retval.basis[1]);
         return retval;
     }
 
@@ -112,7 +115,8 @@ struct SphericalRectangle
         result.r0 = hlsl::mul(basis, origin - observer);
 
         const vector4_type denorm_n_z = vector4_type(-result.r0.y, result.r0.x + extents.x, result.r0.y + extents.y, -result.r0.x);
-        result.n_z = denorm_n_z * hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z);
+        const vector4_type rcpLen_denorm_n_z = hlsl::rsqrt<vector4_type>(hlsl::promote<vector4_type>(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z);
+        result.n_z = denorm_n_z * rcpLen_denorm_n_z;
         result.cosGamma = vector4_type(
             -result.n_z[0] * result.n_z[1],
             -result.n_z[1] * result.n_z[2],
@@ -128,17 +132,17 @@ struct SphericalRectangle
     }
 
     // Kelvin-Stokes theorem: signed projected solid angle = integral_{rect} (n . omega) d_omega
+    // TODO: don't take the observer, observer should be taken at creation
     scalar_type projectedSolidAngle(const vector3_type observer, const vector3_type receiverNormal) NBL_CONST_MEMBER_FUNC
     {
         return projectedSolidAngleFromLocal(hlsl::mul(basis, origin - observer), hlsl::mul(basis, receiverNormal));
     }
 
-    // Overload for when r0 and localNormal are already computed (avoids redundant mul(basis, ...)).
-    // Exploits rectangle structure: all 4 corners share the same z, so cross products
-    // have only 2 nonzero components each, and externalProducts can be computed without
-    // normalizing the corner directions.
+    // TODO: only take a `localN`
     scalar_type projectedSolidAngleFromLocal(const vector3_type r0, const vector3_type n) NBL_CONST_MEMBER_FUNC
     {
+        // FUN FACT: `n_z` already holds the Z coordinate of the NORMALIZED `awayFromEdgePlane`, the non-zero coordinate absolute value is equal to `r0.z * rcpLen_denorm_n_z`
+// TODO: skip all this code until  just call `acos` on the `unnormDots`
         const scalar_type x0 = r0.x, y0 = r0.y, z = r0.z;
         const scalar_type x1 = x0 + extents.x;
         const scalar_type y1 = y0 + extents.y;
@@ -154,8 +158,9 @@ struct SphericalRectangle
             zSq + x1 * x1,
             zSq + y1 * y1,
             zSq + x0 * x0
-        );
+        ); // TODO: this is already computed as `rcpLen_denorm_n_z`
 
+// TODO: this can be computed from `denorm_n_z`, `z` and `rcpLen_denorm_n_z` instead
         // dot(cross(ri,rj), n) / |cross(ri,rj)| the ex/ey scale factors cancel
         const vector4_type crossDotN = vector4_type(
             z * n.y - y0 * n.z,
@@ -164,8 +169,12 @@ struct SphericalRectangle
             z * n.x - x0 * n.z
         );
         // The ABS makes the computation correct for abs(cos(theta)) (BSDF projected solid angle).
-        const vector4_type externalProducts = hlsl::abs(crossDotN) * hlsl::rsqrt<vector4_type>(crossLenSq);
+        const vector4_type externalProducts = crossDotN * hlsl::rsqrt<vector4_type>(crossLenSq);
 
+// TODO: isn't `rcpLen_denorm_n_z` related to the sin^-1() of arclengths ? Wouldn't `ACOS_CSC` apply instead  of `acos*rsqrt(1-cos^2)`
+// wouldn't then `hlsl::promote<vector4_type>(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z` be the sin^2 ?
+// it would probably have to be a different fit, here's the `acos(sqrt(1-x*x))/x` curve fit again, revealed to me in a dream
+//  https://www.desmos.com/calculator/sbdrulot5a = exp2(-1.6*sqrt(1-x*x))*A+B
         // cos(arc length) between adjacent corners: dot(ri,rj) / (|ri|*|rj|)
         const vector4_type lenSq = vector4_type(
             x0 * x0 + y0 * y0,
@@ -184,9 +193,11 @@ struct SphericalRectangle
         // rcpLen[i]*rcpLen[j] for adjacent pairs: (0,1), (1,2), (2,3), (3,0)
         const vector4_type cos_sides = unnormDots * rcpLen * rcpLen.yzwx;
 
+        // TODO: there's the same opportunity for optimization of this as the Spherical Triangle
+        // https://www.linkedin.com/posts/matt-kielan-9b054a165_untitled-graph-activity-7442910005671923712-jHz6?utm_source=share&utm_medium=member_desktop&rcm=ACoAACdp2RQBqq2bJfC2zxpsme-vRv2zh9oP-8E
         const vector4_type pyramidAngles = hlsl::acos<vector4_type>(cos_sides);
 
-        return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5);
+        return hlsl::abs(hlsl::dot(pyramidAngles, externalProducts)) * scalar_type(0.5);
     }
 
     vector3_type origin;
diff --git a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
index 1a5681b39e..b8a2c7229e 100644
--- a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
+++ b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl
@@ -20,6 +20,26 @@ namespace hlsl
 namespace shapes
 {
 
+// Approximates `acos(arg)/sqrt(1-arg*arg)` (acos times cosecant of the same angle). TODO: move to where fast_acos lives
+template<typename T, int order=2>
+T acos_csc_approx(const T arg)
+{
+    const T u = hlsl::log2(_static_cast<T>(1)+arg);
+    // The curve fit "revealed in a dream" to me is `exp2(F(log2(x+1)))` where `F(u)` is a polynomial
+    // I have a feeling that a polynomial of ((Au+B)u+C)u+D could be sufficient if it has following properties:
+    // `F(1) = 0` (both fits below vanish at `u = 1`, so the result is 1 at `arg = 1`) and
+    // `F(u) <= log2(\frac{\cos^{-1}\left(2^{x}-1\right)}{\sqrt{1-\left(2^{x}-1\right)^{2}}})` because you want to consistently under-estimate the Projected Solid Angle to avoid creating energy
+    // See https://www.desmos.com/calculator/sdptomhbju
+    // Furthermore we could clip the polynomial calc to `Cu+D` or `(Bu+C)u+D` for small arguments
+    T poly;
+    // TODO: actually optimize these constants in real world scenarios (renders)
+    if (order==1)
+        poly = (_static_cast<T>(1)-u)*_static_cast<T>(0.6);
+    else // order==2; any other `order` falls back to the quadratic fit so `poly` is never read uninitialized
+        poly = (_static_cast<T>(1)-u)*_static_cast<T>(0.637)+(_static_cast<T>(1) - u * u) * _static_cast<T>(0.0115);
+    return hlsl::exp2<T>(poly);
+}
+
 template<typename T>
 struct SphericalTriangle
 {
@@ -54,6 +74,7 @@ struct SphericalTriangle
         // degenerate triangle: any side has near-zero sin, so csc blows up
         if (hlsl::any<vector<bool, 3> >(retval.csc_sides >= hlsl::promote<vector3_type>(numeric_limits<scalar_type>::max)))
         {
+            // TODO: can't do this, still need to be able to sample thin triangle like a line light, so need to know all the angles which are still valid
             retval.cos_vertices = hlsl::promote<vector3_type>(0.0);
             retval.sin_vertices = hlsl::promote<vector3_type>(0.0);
             retval.solid_angle = 0;
@@ -86,35 +107,29 @@ struct SphericalTriangle
         if (solid_angle <= numeric_limits<scalar_type>::epsilon)
             return 0;
 
-        matrix<scalar_type, 3, 3> awayFromEdgePlane;
-        awayFromEdgePlane[0] = hlsl::cross(vertices[1], vertices[2]) * csc_sides[0];
-        awayFromEdgePlane[1] = hlsl::cross(vertices[2], vertices[0]) * csc_sides[1];
-        awayFromEdgePlane[2] = hlsl::cross(vertices[0], vertices[1]) * csc_sides[2];
+        // `cross(A,B)*acos(dot(A,B))/sqrt(1-dot^2)` can be done with `cross(A,B)*acos_csc_approx(dot(A,B))`
+#define ACOS_CSC(I) acos_csc_approx(cos_sides[I])
+//#define ACOS_CSC(I) hlsl::acos(cos_sides[I])*csc_sides[I]
+        scalar_type externalProductsWeightedByPyramidAngles = hlsl::dot(hlsl::cross(vertices[1], vertices[2]),receiverNormal) * ACOS_CSC(0);
+        externalProductsWeightedByPyramidAngles += hlsl::dot(hlsl::cross(vertices[2], vertices[0]),receiverNormal) * ACOS_CSC(1);
+        externalProductsWeightedByPyramidAngles += hlsl::dot(hlsl::cross(vertices[0], vertices[1]),receiverNormal) * ACOS_CSC(2);
+#undef ACOS_CSC
+
         // The ABS makes it so that the computation is correct for an `abs(cos(theta))` factor which is the projected solid angle used for a BSDF.
+        // It also makes the computation insensitive to the CW or CCW winding of the vertices in the triangle.
         // Proof: Kelvin-Stokes theorem, if you split the set into two along the horizon with constant CCW winding, the `cross` along the shared edge
         // goes in different directions and cancels out, while `acos` of the clipped great arcs corresponding to polygon edges add up to the original sides again.
-        const vector3_type externalProducts = hlsl::abs(hlsl::mul(/* transposed already */awayFromEdgePlane, receiverNormal));
-
-        // Far TODO: `cross(A,B)*acos(dot(A,B))/sin(1-dot^2)` can be done with `cross*acos_csc_approx(dot(A,B))`
-        // We could skip the `csc_sides` factor, and computing `pyramidAngles` and replace them with this approximation weighting before the dot product with the receiver notmal
-        // The curve fit "revealed in a dream" to me is `exp2(F(log2(x+1)))` where `F(u)` is a polynomial, so far I've calculated `F = (1-u)0.635+(1-u^2)0.0118` which gives <5% error until 165 degrees
-        // I have a feeling that a polynomial of ((Au+B)u+C)u+D could be sufficient if it has following properties:
-        // `F(0) = 0` and
-        // `F(u) <= log2(\frac{\cos^{-1}\left(2^{x}-1\right)}{\sqrt{1-\left(2^{x}-1\right)^{2}}})` because you want to consistently under-estimate the Projected Solid Angle to avoid creating energy
-        // See https://www.desmos.com/calculator/sdptomhbju
-        // Furthermore we could clip the polynomial calc to `Cu+D or `(Bu+C)u+D` for small arguments
-        const vector3_type pyramidAngles = hlsl::acos<vector3_type>(cos_sides);
-        // So that triangle covering almost whole hemisphere sums to PI
-        return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5);
+        // ABS is taken of the summed result, same as `SphericalRectangle::projectedSolidAngleFromLocal`. The 0.5 is so that triangle covering almost whole hemisphere sums to PI
+        return hlsl::abs(externalProductsWeightedByPyramidAngles) * scalar_type(0.5);
     }
 
     vector3_type vertices[3];
     // angles of vertices with origin, so the sides are INSIDE the sphere
     vector3_type cos_sides;
-    vector3_type csc_sides;
+    vector3_type csc_sides; // TODO: spherical triangle sampling only needs `csc_sides[1]` and possibly `csc_sides[2]`
     // angles between arcs on the sphere, so angles in the TANGENT plane at each vertex
     vector3_type cos_vertices;
-    vector3_type sin_vertices;
+    vector3_type sin_vertices; // TODO: spherical triangle sampling only needs `sin_vertices[0]` a.k.a `sinA`
     scalar_type solid_angle;
 };
 
diff --git a/include/nbl/core/sampling/RandomSampler.h b/include/nbl/core/sampling/RandomSampler.h
index 39832dc8f1..b692ef5e08 100644
--- a/include/nbl/core/sampling/RandomSampler.h
+++ b/include/nbl/core/sampling/RandomSampler.h
@@ -11,8 +11,8 @@
 namespace nbl::core
 {
 
-class RandomSampler
-{
+	class RandomSampler
+	{
 	public:
 		RandomSampler(uint32_t _seed)
 		{
@@ -25,9 +25,24 @@ class RandomSampler
 			return mersenneTwister();
 		}
 
+		// Returns a float in [0, 1); only the top 24 random bits are used so the int->float conversion is exact and can never round up to 1.0f
+		inline float nextFloat()
+		{
+			// 1 / 2^24 (a float carries a 24-bit significand, so every 24-bit integer is exactly representable)
+			constexpr float norm = 1.0f / 16777216.0f;
+			return static_cast<float>(mersenneTwister() >> 8) * norm;
+		}
+
+		// Returns a float in [min, max), up to float rounding of the final scale-and-offset for very narrow ranges
+		inline float nextFloat(float min, float max)
+		{
+			constexpr float norm = 1.0f / 16777216.0f; // 1 / 2^24: the top 24 bits convert exactly, so the unit sample stays strictly below 1.0f
+			return min + (static_cast<float>(mersenneTwister() >> 8) * norm) * (max - min);
+		}
+
 	protected:
 		std::mt19937 mersenneTwister;
-};
+	};
 
 
 }
diff --git a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h
index 39013417dc..53cef98d71 100644
--- a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h
+++ b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h
@@ -24,11 +24,18 @@ struct ProtoPipeline final
 			const video::IGPURenderpass* renderpass,
 			const uint32_t subpassIx=0,
 			asset::SBlendParams blendParams = {},
+			asset::SRasterizationParams rasterizationParams = DefaultRasterParams,
 			const hlsl::SurfaceTransform::FLAG_BITS swapchainTransform=hlsl::SurfaceTransform::FLAG_BITS::IDENTITY_BIT,
 			video::IGPUPipelineCache* pipelineCache = nullptr
 		);
 
 		core::smart_refctd_ptr<asset::IShader> m_vxShader;
+
+		constexpr static inline asset::SRasterizationParams DefaultRasterParams = {
+			.faceCullingMode = asset::EFCM_NONE,
+			.depthWriteEnable = false,
+			.depthCompareOp = asset::ECO_ALWAYS
+		};
 };
 
 bool recordDrawCall(video::IGPUCommandBuffer* commandBuffer);
diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h
index aeae8c866a..6e2d299cec 100644
--- a/include/nbl/video/IGPUPipeline.h
+++ b/include/nbl/video/IGPUPipeline.h
@@ -12,6 +12,8 @@
 #include "nbl/asset/IPipeline.h"
 #include "nbl/system/to_string.h"
 
+#include <cstddef>
+
 namespace nbl::video
 {
 
@@ -128,6 +130,58 @@ class IGPUPipelineBase {
 
         using SShaderEntryMap = SShaderSpecInfo::entry_map_t;
 
+        // One row of VK_KHR_pipeline_executable_properties statistics, kept as the driver
+        // returned it (no formatting, no string scraping). The string field on
+        // SExecutableInfo below is still populated -- this is a parallel, structured view
+        // for callers that need numbers (benchmarks, regressions, baseline diffs).
+        struct SExecutableStatistic
+        {
+            enum class FORMAT : uint8_t { BOOL32, INT64, UINT64, FLOAT64 };
+            std::string name;
+            std::string description;
+            FORMAT      format = FORMAT::UINT64;
+            union Value
+            {
+                bool     b32;
+                int64_t  i64;
+                uint64_t u64;
+                double   f64;
+            } value = {};
+
+            // Convenience: collapse to a uint64_t regardless of the original format. Negative values and NaN
+            // clamp to 0, doubles at or above 2^64 saturate (an out-of-range double->integer cast is undefined).
+            inline uint64_t asUint() const
+            {
+                switch (format)
+                {
+                    case FORMAT::BOOL32:  return value.b32 ? 1u : 0u;
+                    case FORMAT::INT64:   return value.i64 < 0 ? 0u : uint64_t(value.i64);
+                    case FORMAT::UINT64:  return value.u64;
+                    case FORMAT::FLOAT64: return !(value.f64 >= 1.0) ? 0u : (value.f64 >= 18446744073709551616.0 ? ~uint64_t(0) : uint64_t(value.f64));
+                }
+                return 0u;
+            }
+        };
+
+        // One IR (e.g. SPIR-V, AMD ISA, NV SASS) returned by the driver. Text payloads
+        // are stored without the trailing NUL the driver adds; binary payloads are kept
+        // verbatim. Lets callers dump to a file or hash the data without re-parsing the
+        // pretty-printed blob.
+        struct SInternalRepresentation
+        {
+            std::string             name;
+            std::string             description;
+            bool                    isText = false;
+            core::vector<std::byte> data;
+
+            // Zero-copy textual view over `data`; yields an empty view when the payload is binary or empty.
+            inline std::string_view asText() const
+            {
+                const bool hasText = isText && !data.empty();
+                return hasText ? std::string_view(reinterpret_cast<const char*>(data.data()), data.size()) : std::string_view{};
+            }
+        };
+
         // Per-executable info from VK_KHR_pipeline_executable_properties
         struct SExecutableInfo
         {
@@ -135,8 +189,10 @@ class IGPUPipelineBase {
             std::string description;
             core::bitflag<hlsl::ShaderStage> stages = hlsl::ShaderStage::ESS_UNKNOWN;
             uint32_t subgroupSize = 0;
-            std::string statistics;
-            std::string internalRepresentations;
+            std::string statistics;                                              // human-readable, aligned columns; what users log today
+            core::vector<SExecutableStatistic> structuredStatistics;             // same data, structured; for programmatic use
+            std::string internalRepresentations;                                 // human-readable concatenation of IRs (textual ones inline, binaries as "[binary data, N bytes]")
+            core::vector<SInternalRepresentation> structuredInternalRepresentations; // same data, structured; for programmatic use (file dump, hashing, etc.)
         };
 
         inline std::span<const SExecutableInfo> getExecutableInfo() const { return m_executableInfo; }
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index 12b4af1bef..a6985b35d1 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -269,6 +269,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/spherical_triangle.hls
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/spherical_rectangle.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/aabb.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/obb.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/obb_silhouette.hlsl")
 #sampling
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/basic.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/linear.hlsl")
@@ -281,6 +282,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/spherical_triangle.h
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/projected_spherical_triangle.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/spherical_rectangle.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/projected_spherical_rectangle.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/spherical_pyramid.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/cos_weighted_spheres.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/quotient_and_pdf.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/quotient_and_weight.hlsl")
diff --git a/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp b/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp
index 58b1f2ea84..55dea2eb00 100644
--- a/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp
+++ b/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp
@@ -84,6 +84,7 @@ smart_refctd_ptr<IGPUGraphicsPipeline> ProtoPipeline::createPipeline(
 	const IGPURenderpass* renderpass,
 	const uint32_t subpassIx,
 	SBlendParams blendParams,
+	asset::SRasterizationParams rasterizationParams,
 	const hlsl::SurfaceTransform::FLAG_BITS swapchainTransform,
 	IGPUPipelineCache* pipelineCache)
 {
@@ -94,11 +95,6 @@ smart_refctd_ptr<IGPUGraphicsPipeline> ProtoPipeline::createPipeline(
 
 	smart_refctd_ptr<IGPUGraphicsPipeline> m_retval;
 	{
-		constexpr SRasterizationParams defaultRasterParams = {
-			.faceCullingMode = EFCM_NONE,
-			.depthWriteEnable = false,
-			.depthCompareOp = ECO_ALWAYS
-		};
 		const auto orientationAsUint32 = static_cast<uint32_t>(swapchainTransform);
 
 		IGPUPipelineBase::SShaderEntryMap specConstants;
@@ -111,7 +107,7 @@ smart_refctd_ptr<IGPUGraphicsPipeline> ProtoPipeline::createPipeline(
 		params[0].cached = {
 			.vertexInput = {}, // The Full Screen Triangle doesn't use any HW vertex input state
 			.primitiveAssembly = {},
-			.rasterization = defaultRasterParams,
+			.rasterization = rasterizationParams,
 			.blend = blendParams,
 			.subpassIx = subpassIx
 		};
diff --git a/src/nbl/video/CVulkanPipelineExecutableInfo.h b/src/nbl/video/CVulkanPipelineExecutableInfo.h
index 4cdf1f194f..79f1d8d573 100644
--- a/src/nbl/video/CVulkanPipelineExecutableInfo.h
+++ b/src/nbl/video/CVulkanPipelineExecutableInfo.h
@@ -6,6 +6,8 @@
 
 #include <volk.h>
 
+#include <cstring>
+
 namespace nbl::video
 {
 
@@ -16,7 +18,8 @@ inline void populateExecutableInfoFromVulkan(core::vector<IGPUPipelineBase::SExe
 
 	// Enumerate executables
 	uint32_t executableCount = 0;
-	vk->vk.vkGetPipelineExecutablePropertiesKHR(vkDevice, &pipelineInfo, &executableCount, nullptr);
+	if (vk->vk.vkGetPipelineExecutablePropertiesKHR(vkDevice, &pipelineInfo, &executableCount, nullptr) != VK_SUCCESS)
+		return;
 
 	if (executableCount == 0)
 		return;
@@ -24,7 +27,8 @@ inline void populateExecutableInfoFromVulkan(core::vector<IGPUPipelineBase::SExe
 	core::vector<VkPipelineExecutablePropertiesKHR> properties(executableCount);
 	for (uint32_t i = 0; i < executableCount; ++i)
 		properties[i] = {VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_PROPERTIES_KHR, nullptr};
-	vk->vk.vkGetPipelineExecutablePropertiesKHR(vkDevice, &pipelineInfo, &executableCount, properties.data());
+	if (vk->vk.vkGetPipelineExecutablePropertiesKHR(vkDevice, &pipelineInfo, &executableCount, properties.data()) != VK_SUCCESS)
+		return;
 
 	outInfo.resize(executableCount);
 
@@ -43,53 +47,75 @@ inline void populateExecutableInfoFromVulkan(core::vector<IGPUPipelineBase::SExe
 		execInfo.executableIndex = i;
 
 		uint32_t statCount = 0;
-		vk->vk.vkGetPipelineExecutableStatisticsKHR(vkDevice, &execInfo, &statCount, nullptr);
+		if (vk->vk.vkGetPipelineExecutableStatisticsKHR(vkDevice, &execInfo, &statCount, nullptr) != VK_SUCCESS)
+			statCount = 0;
 
 		if (statCount > 0)
 		{
 			core::vector<VkPipelineExecutableStatisticKHR> stats(statCount);
 			for (uint32_t s = 0; s < statCount; ++s)
 				stats[s] = {VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_STATISTIC_KHR, nullptr};
-			vk->vk.vkGetPipelineExecutableStatisticsKHR(vkDevice, &execInfo, &statCount, stats.data());
+			if (vk->vk.vkGetPipelineExecutableStatisticsKHR(vkDevice, &execInfo, &statCount, stats.data()) != VK_SUCCESS)
+				statCount = 0;
 
-			// First pass: format name:value pairs and find max width for alignment
-			core::vector<std::string> nameValues(statCount);
-			size_t maxNameValueLen = 0;
-			for (uint32_t s = 0; s < statCount; ++s)
+			if (statCount > 0)
 			{
-				const auto& stat = stats[s];
-				std::string value;
-				switch (stat.format)
+				info.structuredStatistics.resize(statCount);
+
+				// First pass: format name:value pairs (for the human-readable string) and
+				// fill structuredStatistics in lockstep so callers can pick whichever view
+				// they need without re-parsing.
+				core::vector<std::string> nameValues(statCount);
+				size_t maxNameValueLen = 0;
+				for (uint32_t s = 0; s < statCount; ++s)
 				{
-					case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_BOOL32_KHR:
-						value = stat.value.b32 ? "true" : "false";
-						break;
-					case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_INT64_KHR:
-						value = std::to_string(stat.value.i64);
-						break;
-					case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR:
-						value = std::to_string(stat.value.u64);
-						break;
-					case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_FLOAT64_KHR:
-						value = std::to_string(stat.value.f64);
-						break;
-					default:
-						value = "<unknown format>";
-						break;
+					const auto& stat = stats[s];
+					auto& outStat = info.structuredStatistics[s];
+					outStat.name        = stat.name;
+					outStat.description = stat.description;
+
+					std::string value;
+					switch (stat.format)
+					{
+						case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_BOOL32_KHR:
+							outStat.format    = IGPUPipelineBase::SExecutableStatistic::FORMAT::BOOL32;
+							outStat.value.b32 = stat.value.b32 != VK_FALSE;
+							value = outStat.value.b32 ? "true" : "false";
+							break;
+						case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_INT64_KHR:
+							outStat.format    = IGPUPipelineBase::SExecutableStatistic::FORMAT::INT64;
+							outStat.value.i64 = stat.value.i64;
+							value = std::to_string(stat.value.i64);
+							break;
+						case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR:
+							outStat.format    = IGPUPipelineBase::SExecutableStatistic::FORMAT::UINT64;
+							outStat.value.u64 = stat.value.u64;
+							value = std::to_string(stat.value.u64);
+							break;
+						case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_FLOAT64_KHR:
+							outStat.format    = IGPUPipelineBase::SExecutableStatistic::FORMAT::FLOAT64;
+							outStat.value.f64 = stat.value.f64;
+							value = std::to_string(stat.value.f64);
+							break;
+						default:
+							// Unknown format: leave structured value zero, keep raw text marker
+							value = "<unknown format>";
+							break;
+					}
+					nameValues[s] = std::string(stat.name) + ": " + value;
+					maxNameValueLen = std::max(maxNameValueLen, nameValues[s].size());
 				}
-				nameValues[s] = std::string(stat.name) + ": " + value;
-				maxNameValueLen = std::max(maxNameValueLen, nameValues[s].size());
-			}
 
-			// Second pass: emit with aligned columns
-			std::string& statsStr = info.statistics;
-			for (uint32_t s = 0; s < statCount; ++s)
-			{
-				statsStr += nameValues[s];
-				statsStr.append(maxNameValueLen - nameValues[s].size() + 4, ' ');
-				statsStr += "// ";
-				statsStr += stats[s].description;
-				statsStr += "\n";
+				// Second pass: emit with aligned columns (unchanged human-readable format)
+				std::string& statsStr = info.statistics;
+				for (uint32_t s = 0; s < statCount; ++s)
+				{
+					statsStr += nameValues[s];
+					statsStr.append(maxNameValueLen - nameValues[s].size() + 4, ' ');
+					statsStr += "// ";
+					statsStr += stats[s].description;
+					statsStr += "\n";
+				}
 			}
 		}
 
@@ -97,7 +123,8 @@ inline void populateExecutableInfoFromVulkan(core::vector<IGPUPipelineBase::SExe
 		if (includeInternalRepresentations)
 		{
 			uint32_t irCount = 0;
-			vk->vk.vkGetPipelineExecutableInternalRepresentationsKHR(vkDevice, &execInfo, &irCount, nullptr);
+			if (vk->vk.vkGetPipelineExecutableInternalRepresentationsKHR(vkDevice, &execInfo, &irCount, nullptr) != VK_SUCCESS)
+				irCount = 0;
 
 			if (irCount > 0)
 			{
@@ -106,7 +133,8 @@ inline void populateExecutableInfoFromVulkan(core::vector<IGPUPipelineBase::SExe
 					irs[r] = {VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INTERNAL_REPRESENTATION_KHR, nullptr};
 
 				// First call to get sizes
-				vk->vk.vkGetPipelineExecutableInternalRepresentationsKHR(vkDevice, &execInfo, &irCount, irs.data());
+				if (vk->vk.vkGetPipelineExecutableInternalRepresentationsKHR(vkDevice, &execInfo, &irCount, irs.data()) != VK_SUCCESS)
+					continue;
 
 				// Allocate data buffers and second call to get data
 				core::vector<core::vector<char>> irData(irCount);
@@ -116,26 +144,41 @@ inline void populateExecutableInfoFromVulkan(core::vector<IGPUPipelineBase::SExe
 					irs[r].pData = irData[r].data();
 				}
 
-				vk->vk.vkGetPipelineExecutableInternalRepresentationsKHR(vkDevice, &execInfo, &irCount, irs.data());
+				if (vk->vk.vkGetPipelineExecutableInternalRepresentationsKHR(vkDevice, &execInfo, &irCount, irs.data()) != VK_SUCCESS)
+					continue;
+
+				info.structuredInternalRepresentations.resize(irCount);
 
 				std::string& irStr = info.internalRepresentations;
 				for (uint32_t r = 0; r < irCount; ++r)
 				{
+					auto& outIr = info.structuredInternalRepresentations[r];
+					outIr.name        = irs[r].name;
+					outIr.description = irs[r].description;
+					outIr.isText      = irs[r].isText != VK_FALSE;
+					// Text payloads include a trailing NUL per the spec; drop it from the
+					// structured copy so asText().size() matches the textual length.
+					const size_t rawSize  = irs[r].dataSize;
+					const size_t copySize = outIr.isText && rawSize > 0 ? rawSize - 1 : rawSize;
+					outIr.data.resize(copySize);
+					if (copySize > 0)
+						std::memcpy(outIr.data.data(), irs[r].pData, copySize);
+
 					irStr += "---- ";
 					irStr += irs[r].name;
 					irStr += " ----\n";
 					irStr += irs[r].description;
 					irStr += "\n";
-					if (irs[r].isText)
+					if (outIr.isText)
 					{
 						auto* str = static_cast<const char*>(irs[r].pData);
-						irStr.append(str, irs[r].dataSize > 0 ? irs[r].dataSize - 1 : 0);
+						irStr.append(str, copySize);
 						irStr += "\n";
 					}
 					else
 					{
 						irStr += "[binary data, ";
-						irStr += std::to_string(irs[r].dataSize);
+						irStr += std::to_string(rawSize);
 						irStr += " bytes]\n";
 					}
 				}