diff --git a/examples_tests b/examples_tests index 4e7e5dd5f2..5f2a1f7e9d 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 4e7e5dd5f2e707399c155c0a2e5ec009a848d1e4 +Subproject commit 5f2a1f7e9d52fc2112bb24629fff2157b92ce665 diff --git a/include/nbl/builtin/hlsl/algorithm.hlsl b/include/nbl/builtin/hlsl/algorithm.hlsl index 631001686e..d80ebdf443 100644 --- a/include/nbl/builtin/hlsl/algorithm.hlsl +++ b/include/nbl/builtin/hlsl/algorithm.hlsl @@ -142,7 +142,9 @@ struct bound_t void comp_step(NBL_REF_ARG(Accessor) accessor, const uint32_t testPoint, const uint32_t rightBegin) { - if (compare(accessor[testPoint],value)) + typename Accessor::value_type val; + accessor.get(testPoint, val); + if (compare(val,value)) it = rightBegin; } void comp_step(NBL_REF_ARG(Accessor) accessor, const uint32_t testPoint) diff --git a/include/nbl/builtin/hlsl/bit.hlsl b/include/nbl/builtin/hlsl/bit.hlsl index 71075d5491..f712c5aad5 100644 --- a/include/nbl/builtin/hlsl/bit.hlsl +++ b/include/nbl/builtin/hlsl/bit.hlsl @@ -3,6 +3,7 @@ #include +#include #ifndef __HLSL_VERSION @@ -123,5 +124,32 @@ uint16_t countl_zero(T n) } } #endif - + +namespace nbl +{ +namespace hlsl +{ + +// Variable-width sub-word bit rotation +template +NBL_CONSTEXPR_FUNC T rotl(T value, uint32_t bits, uint32_t width) +{ + const T mask = (width >= sizeof(T) * 8) ? ~T(0) : ((T(1) << width) - T(1)); + value &= mask; + bits &= -(bits < width); + return ((value << bits) | (value >> (width - bits))) & mask; +} + +template +NBL_CONSTEXPR_FUNC T rotr(T value, uint32_t bits, uint32_t width) +{ + const T mask = (width >= sizeof(T) * 8) ? ~T(0) : ((T(1) << width) - T(1)); + value &= mask; + bits &= -(bits < width); + return ((value >> bits) | (value << (width - bits))) & mask; +} + +} +} + #endif diff --git a/include/nbl/builtin/hlsl/functional.hlsl b/include/nbl/builtin/hlsl/functional.hlsl index 118fe07c63..6a155f2b92 100644 --- a/include/nbl/builtin/hlsl/functional.hlsl +++ b/include/nbl/builtin/hlsl/functional.hlsl @@ -89,12 +89,23 @@ struct reference_wrapper : enable_if_t< return lhs OP rhs; \ } +#define ALIAS_STD_CMP(NAME,OP) template struct NAME { \ + using type_t = T; \ + \ + bool operator()(NBL_CONST_REF_ARG(T) lhs, NBL_CONST_REF_ARG(T) rhs) \ + { \ + return lhs OP rhs; \ + } + #else // CPP #define ALIAS_STD(NAME,OP) template struct NAME : std::NAME { \ using type_t = T; +#define ALIAS_STD_CMP(NAME,OP) template struct NAME : std::NAME { \ + using type_t = T; + #endif ALIAS_STD(bit_and,&) @@ -136,14 +147,15 @@ ALIAS_STD(divides,/) }; -ALIAS_STD(equal_to, ==) }; -ALIAS_STD(not_equal_to, !=) }; -ALIAS_STD(greater, >) }; -ALIAS_STD(less, <) }; -ALIAS_STD(greater_equal, >=) }; -ALIAS_STD(less_equal, <=) }; +ALIAS_STD_CMP(equal_to, ==) }; +ALIAS_STD_CMP(not_equal_to, !=) }; +ALIAS_STD_CMP(greater, >) }; +ALIAS_STD_CMP(less, <) }; +ALIAS_STD_CMP(greater_equal, >=) }; +ALIAS_STD_CMP(less_equal, <=) }; #undef ALIAS_STD +#undef ALIAS_STD_CMP // The above comparison operators return bool on STD, but in HLSL they're supposed to yield bool vectors, so here's a specialization so that they return `vector` for vectorial types diff --git a/include/nbl/builtin/hlsl/ies/sampler.hlsl b/include/nbl/builtin/hlsl/ies/sampler.hlsl index ab4046477c..a6309fd128 100644 --- a/include/nbl/builtin/hlsl/ies/sampler.hlsl +++ b/include/nbl/builtin/hlsl/ies/sampler.hlsl @@ -85,7 +85,7 @@ struct CandelaSampler const angle_t vAngle = degrees(polar.theta); const angle_t hAngle = degrees(__wrapPhi(polar.phi, symmetry)); -#define NBL_IES_DEF_ANGLE_ACC(T, EXPR) struct T { using value_type = angle_t; accessor_t acc; value_type operator[](uint32_t idx) NBL_CONST_MEMBER_FUNC { return EXPR; } }; +#define NBL_IES_DEF_ANGLE_ACC(T, EXPR) struct T { using value_type = angle_t; accessor_t acc; value_type operator[](uint32_t idx) NBL_CONST_MEMBER_FUNC { return EXPR; } void get(uint32_t idx, NBL_REF_ARG(value_type) val) NBL_CONST_MEMBER_FUNC { val = EXPR; } }; NBL_IES_DEF_ANGLE_ACC(VAcc, acc.vAngle(idx)) NBL_IES_DEF_ANGLE_ACC(HAcc, acc.hAngle(idx)) diff --git a/include/nbl/builtin/hlsl/math/functions.hlsl b/include/nbl/builtin/hlsl/math/functions.hlsl index b5b6f8feea..a80bdda904 100644 --- a/include/nbl/builtin/hlsl/math/functions.hlsl +++ b/include/nbl/builtin/hlsl/math/functions.hlsl @@ -93,13 +93,22 @@ scalar_type_t lpNorm(NBL_CONST_REF_ARG(T) v) // valid only for `theta` in [-PI,PI] -template ) +// UseRealSinCos=true -> back-to-back sin + cos. Saturates the special-function pipeline, enables vendor sincos fusion, full precision near multiples of pi. +// UseRealSinCos=false -> cos + sqrt(1-c*c) with sign recovered from theta. Saves one special-function op when cos alone is cheaper than sin+cos, but suffers catastrophic cancellation as |c| -> 1. +template ) void sincos(T theta, NBL_REF_ARG(T) s, NBL_REF_ARG(T) c) { - s = sin(theta); - c = cos(theta); - // s = sqrt(T(NBL_FP64_LITERAL(1.0))-c*c); - // s = ieee754::flipSign(s, theta < T(NBL_FP64_LITERAL(0.0))); + if (UseRealSinCos) + { + s = sin(theta); + c = cos(theta); + } + else + { + c = cos(theta); + s = sqrt(T(NBL_FP64_LITERAL(1.0))-c*c); + s = ieee754::flipSign(s, theta < T(NBL_FP64_LITERAL(0.0))); + } } template ::Dimension == 3) diff --git a/include/nbl/builtin/hlsl/sampling/alias_table.hlsl b/include/nbl/builtin/hlsl/sampling/alias_table.hlsl index 15742e10f3..8e7b3249a0 100644 --- a/include/nbl/builtin/hlsl/sampling/alias_table.hlsl +++ b/include/nbl/builtin/hlsl/sampling/alias_table.hlsl @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -17,84 +18,187 @@ namespace hlsl namespace sampling { -// Alias Method (Vose/Walker) discrete sampler. -// -// Samples a discrete index in [0, N) with probability proportional to -// precomputed weights in O(1) time per sample, using a prebuilt alias table. -// -// Accessor template parameters must satisfy GenericReadAccessor: -// accessor.template get(index, outVal) // void, writes to outVal -// -// - ProbabilityAccessor: reads scalar_type threshold in [0, 1] for bin i -// - AliasIndexAccessor: reads uint32_t redirect index for bin i -// - PdfAccessor: reads scalar_type weight[i] / totalWeight -// -// Satisfies TractableSampler (not BackwardTractableSampler: the mapping is discrete). -// The cache stores the PDF value looked up during generate, avoiding redundant -// storage of the codomain (sampled index) which is already the return value. -template result = bin +// else -> result = getTarget(word) +// Quantizing the threshold to (32 - Log2N) bits is precision-neutral: `u` +// already consumed Log2N bits of randomness producing `bin`, so `remainder` +// carries exactly that many bits of discriminatory power. +namespace impl +{ +template +struct AliasBitDecoder +{ + static uint32_t getTarget(uint32_t word) + { + return word & ((1u << Log2N) - 1u); + } + template + static T getStayProb(uint32_t word) + { + const uint32_t unormMax = (~0u) >> Log2N; + return T(word >> Log2N) / T(unormMax); + } +}; +} // namespace impl + +// 8 B entry used by the NBig == true variant. Embeds the bin's own pdf +// alongside the packed word so the common stay-case needs no extra tap. +template +struct PackedAliasEntryB +{ + uint32_t packedWord; // low Log2N: redirect target; high 32-Log2N: stayProb unorm + T ownPdf; // pdf of this bin +}; + + +// NBig == false: 4 B packed word per bin + separate pdf[] array. Per sample +// = one 4 B word load + one unconditional 4 B pdf[] tap indexed by the +// selected bin (either the current bin or its redirect). Total 8 B whether +// the sample stays or aliases. Favours small N. +template && - concepts::accessors::GenericReadAccessor && - concepts::accessors::GenericReadAccessor && + concepts::accessors::GenericReadAccessor && concepts::accessors::GenericReadAccessor) -struct AliasTable +struct PackedAliasTableA { using scalar_type = T; - using domain_type = Domain; using codomain_type = Codomain; using density_type = scalar_type; using weight_type = density_type; + using decoder = impl::AliasBitDecoder; + NBL_CONSTEXPR_STATIC_INLINE bool NBig = false; struct cache_type { density_type pdf; }; - static AliasTable create(NBL_CONST_REF_ARG(ProbabilityAccessor) _probAccessor, NBL_CONST_REF_ARG(AliasIndexAccessor) _aliasAccessor, NBL_CONST_REF_ARG(PdfAccessor) _pdfAccessor, codomain_type _size) + static PackedAliasTableA create(NBL_CONST_REF_ARG(PackedWordAccessor) _entryAcc, NBL_CONST_REF_ARG(PdfAccessor) _pdfAcc, codomain_type _size) { - AliasTable retval; - retval.probAccessor = _probAccessor; - retval.aliasAccessor = _aliasAccessor; - retval.pdfAccessor = _pdfAccessor; - // Precompute tableSize as float minus 1 ULP so that u=1.0 maps to bin N-1 + PackedAliasTableA retval; + retval.entryAcc = _entryAcc; + retval.pdfAcc = _pdfAcc; const scalar_type exact = scalar_type(_size); retval.tableSizeMinusUlp = nbl::hlsl::bit_cast(nbl::hlsl::bit_cast(exact) - 1u); return retval; } - // BasicSampler interface codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC { const scalar_type scaled = u * tableSizeMinusUlp; const codomain_type bin = _static_cast(scaled); const scalar_type remainder = scaled - scalar_type(bin); - scalar_type prob; - probAccessor.template get(bin, prob); - - // Use if-statement to avoid select: aliasIndex is a dependent read - codomain_type result; - if (remainder < prob) - { - result = bin; - } - else - { - codomain_type alias; - aliasAccessor.template get(bin, alias); - result = alias; - } + uint32_t packedWord; + entryAcc.template get(bin, packedWord); + return hlsl::select(remainder < decoder::template getStayProb(packedWord), bin, codomain_type(decoder::getTarget(packedWord))); + } + codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + const codomain_type result = generate(u); + pdfAcc.template get(result, cache.pdf); return result; } - // TractableSampler interface + density_type forwardPdf(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + return cache.pdf; + } + + weight_type forwardWeight(const domain_type u, NBL_CONST_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + return cache.pdf; + } + + density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC + { + scalar_type pdf; + pdfAcc.template get(v, pdf); + return pdf; + } + + weight_type backwardWeight(const codomain_type v) NBL_CONST_MEMBER_FUNC + { + return backwardPdf(v); + } + + PackedWordAccessor entryAcc; + PdfAccessor pdfAcc; + scalar_type tableSizeMinusUlp; +}; + +// NBig == true: 8 B entry {packedWord, ownPdf} + separate pdf[] array. Per +// sample = one 8 B entry load (covers the common stay case where cache +// already has ownPdf). If the sample aliases, a conditional 4 B pdf[target] +// tap fills the cache. Total 8 B stay, 12 B aliased. Favours large N. +template && + concepts::accessors::GenericReadAccessor, Codomain> && + concepts::accessors::GenericReadAccessor) +struct PackedAliasTableB +{ + using scalar_type = T; + using domain_type = Domain; + using codomain_type = Codomain; + using density_type = scalar_type; + using weight_type = density_type; + using entry_type = PackedAliasEntryB; + using decoder = impl::AliasBitDecoder; + NBL_CONSTEXPR_STATIC_INLINE bool NBig = true; + + struct cache_type + { + density_type pdf; + }; + + static PackedAliasTableB create(NBL_CONST_REF_ARG(EntryAccessor) _entryAcc, NBL_CONST_REF_ARG(PdfAccessor) _pdfAcc, codomain_type _size) + { + PackedAliasTableB retval; + retval.entryAcc = _entryAcc; + retval.pdfAcc = _pdfAcc; + const scalar_type exact = scalar_type(_size); + retval.tableSizeMinusUlp = nbl::hlsl::bit_cast(nbl::hlsl::bit_cast(exact) - 1u); + return retval; + } + + codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC + { + const scalar_type scaled = u * tableSizeMinusUlp; + const codomain_type bin = _static_cast(scaled); + const scalar_type remainder = scaled - scalar_type(bin); + + entry_type entry; + entryAcc.template get(bin, entry); + return hlsl::select(remainder < decoder::template getStayProb(entry.packedWord), bin, codomain_type(decoder::getTarget(entry.packedWord))); + } + codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC { - const codomain_type result = generate(u); - pdfAccessor.template get(result, cache.pdf); + const scalar_type scaled = u * tableSizeMinusUlp; + const codomain_type bin = _static_cast(scaled); + const scalar_type remainder = scaled - scalar_type(bin); + + entry_type entry; + entryAcc.template get(bin, entry); + + const bool stay = remainder < decoder::template getStayProb(entry.packedWord); + + cache.pdf = entry.ownPdf; + codomain_type result = bin; + if (!stay) + { + const codomain_type target = codomain_type(decoder::getTarget(entry.packedWord)); + pdfAcc.template get(target, cache.pdf); + result = target; + } return result; } @@ -111,7 +215,7 @@ struct AliasTable density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC { scalar_type pdf; - pdfAccessor.template get(v, pdf); + pdfAcc.template get(v, pdf); return pdf; } @@ -120,9 +224,8 @@ struct AliasTable return backwardPdf(v); } - ProbabilityAccessor probAccessor; - AliasIndexAccessor aliasAccessor; - PdfAccessor pdfAccessor; + EntryAccessor entryAcc; + PdfAccessor pdfAcc; scalar_type tableSizeMinusUlp; }; diff --git a/include/nbl/builtin/hlsl/sampling/alias_table_builder.h b/include/nbl/builtin/hlsl/sampling/alias_table_builder.h index d02d21488c..2c7c53fd5f 100644 --- a/include/nbl/builtin/hlsl/sampling/alias_table_builder.h +++ b/include/nbl/builtin/hlsl/sampling/alias_table_builder.h @@ -5,7 +5,12 @@ #ifndef _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_BUILDER_H_INCLUDED_ #define _NBL_BUILTIN_HLSL_SAMPLING_ALIAS_TABLE_BUILDER_H_INCLUDED_ +#include #include +#include +#include + +#include namespace nbl { @@ -15,74 +20,139 @@ namespace sampling { // Builds the alias table from an array of non-negative weights. -// All output arrays must be pre-allocated to N entries. // -// Parameters: -// weights - input weights (non-negative, at least one must be > 0) -// N - number of entries -// outProbability - [out] alias table probability threshold per bin, in [0, 1] -// outAlias - [out] alias redirect index per bin -// outPdf - [out] normalized PDF per entry: weight[i] / sum(weights) -// workspace - scratch buffer of N uint32_t entries +// When `weights.size()` is a power of two, the builder transparently appends +// one zero-weight dummy bucket so the GPU-facing table size is N+1 (odd), +// which breaks PoT-periodic address patterns that alias memory channels / +// cache sets on most GPUs. The sampled distribution is unchanged, the dummy +// has stayProb = 0 and always redirects to a real donor. +// +// Output vectors are `resize`d by the builder to the final table size, so +// the caller just passes (possibly empty) vectors and reads back the +// returned size. That returned value is what to pass to the sampler's +// `_size` argument and to use when packing / uploading. template struct AliasTableBuilder { - static void build(std::span weights, T* outProbability, uint32_t* outAlias, T* outPdf, uint32_t* workspace) - { - T totalWeight = T(0); - for (uint32_t i = 0; i < weights.size(); i++) - totalWeight += weights[i]; - - const T rcpTotalWeight = T(1) / totalWeight; - - // Compute PDFs, scaled probabilities, and partition into small/large in one pass - uint32_t smallEnd = 0; - uint32_t largeBegin = weights.size(); - for (uint32_t i = 0; i < weights.size(); i++) - { - outPdf[i] = weights[i] * rcpTotalWeight; - outProbability[i] = outPdf[i] * T(weights.size()); - - if (outProbability[i] < T(1)) - workspace[smallEnd++] = i; - else - workspace[--largeBegin] = i; - } - - // Pair small and large entries - while (smallEnd > 0 && largeBegin < weights.size()) - { - const uint32_t s = workspace[--smallEnd]; - const uint32_t l = workspace[largeBegin]; - - outAlias[s] = l; - // outProbability[s] already holds the correct probability for bin s - - outProbability[l] -= (T(1) - outProbability[s]); - - if (outProbability[l] < T(1)) - { - // l became small: pop from large, push to small - largeBegin++; - workspace[smallEnd++] = l; - } - // else l stays in large (don't pop, reuse next iteration) - } - - // Remaining entries (floating point rounding artifacts) - while (smallEnd > 0) - { - const uint32_t s = workspace[--smallEnd]; - outProbability[s] = T(1); - outAlias[s] = s; - } - while (largeBegin < weights.size()) - { - const uint32_t l = workspace[largeBegin++]; - outProbability[l] = T(1); - outAlias[l] = l; - } - } + // Ugly but much faster: we better ensure the table size is not a power of + // two, so we pad with +1 zero-weight dummy bucket when needed. PoT-sized + // alias tables hit GPU memory channel / cache set aliasing that can be + // wildly (sometimes 2x+) slower than a nearby non-PoT size. Builder owns + // all the sizing (resizes the output vectors, allocates its own scratch), + // so the caller can't get it wrong. + static uint32_t build(std::span weights, std::vector& outProbability, std::vector& outAlias, std::vector& outPdf) + { + const uint32_t userN = static_cast(weights.size()); + const uint32_t tableN = (userN > 1u && (userN & (userN - 1u)) == 0u) ? (userN + 1u) : userN; + + outProbability.resize(tableN); + outAlias.resize(tableN); + outPdf.resize(tableN); + std::vector workspace(tableN); + + T totalWeight = T(0); + for (uint32_t i = 0; i < userN; i++) + totalWeight += weights[i]; + + const T rcpTotalWeight = T(1) / totalWeight; + + // Compute PDFs, scaled probabilities, and partition into small/large in one pass + uint32_t smallEnd = 0u; + uint32_t largeBegin = tableN; + for (uint32_t i = 0; i < userN; i++) + { + outPdf[i] = weights[i] * rcpTotalWeight; + outProbability[i] = outPdf[i] * T(tableN); + + if (outProbability[i] < T(1)) + workspace[smallEnd++] = i; + else + workspace[--largeBegin] = i; + } + // PoT dodge tail: one zero-weight dummy at index userN, always in the small list. + if (tableN != userN) + { + outPdf[userN] = T(0); + outProbability[userN] = T(0); + workspace[smallEnd++] = userN; + } + + // Pair small and large entries + while (smallEnd > 0u && largeBegin < tableN) + { + const uint32_t s = workspace[--smallEnd]; + const uint32_t l = workspace[largeBegin]; + + outAlias[s] = l; + // outProbability[s] already holds the correct probability for bin s + + outProbability[l] -= (T(1) - outProbability[s]); + + if (outProbability[l] < T(1)) + { + // l became small: pop from large, push to small + largeBegin++; + workspace[smallEnd++] = l; + } + // else l stays in large (don't pop, reuse next iteration) + } + + // Remaining entries (floating point rounding artifacts) + while (smallEnd > 0u) + { + const uint32_t s = workspace[--smallEnd]; + outProbability[s] = T(1); + outAlias[s] = s; + } + while (largeBegin < tableN) + { + const uint32_t l = workspace[largeBegin++]; + outProbability[l] = T(1); + outAlias[l] = l; + } + + return tableN; + } + + // Pack (target, stayProb) into a single 32-bit word with Log2N bits for + // target and (32 - Log2N) bits for the unorm-quantized threshold. Used by + // every packed variant; each packX() below calls this on a per-entry basis. + template + static uint32_t packWord(uint32_t target, T stayProb) + { + const uint32_t targetMask = (Log2N == 32u) ? ~0u : ((1u << Log2N) - 1u); + const T clamped = stayProb < T(0) ? T(0) : (stayProb > T(1) ? T(1) : stayProb); + const uint32_t unormMax = (Log2N == 0u) ? ~0u : ((~0u) >> Log2N); + const uint32_t probUnorm = static_cast(std::round(static_cast(clamped) * static_cast(unormMax))); + return (target & targetMask) | (probUnorm << Log2N); + } + + // Variant A, pack SoA outputs into an array of 4 B packed words. The + // pdf[] array is consumed directly by the sampler as a second accessor. + // outWords must be pre-allocated to N uint32_t entries. + template + static void packA(std::span probability, std::span alias, uint32_t* outWords) + { + const uint32_t N = static_cast(probability.size()); + for (uint32_t i = 0; i < N; i++) + outWords[i] = packWord(alias[i], probability[i]); + } + + // Variant B, pack SoA outputs into 8 B entries { packedWord, ownPdf }. + // The pdf[] array is *also* passed to the sampler (same contents as ownPdf + // column, but tapped independently with a 4 B fetch when the sample aliases). + // outEntries must be pre-allocated to N entries. + template + static void packB(std::span probability, std::span alias, std::span pdf, + PackedAliasEntryB* outEntries) + { + const uint32_t N = static_cast(probability.size()); + for (uint32_t i = 0; i < N; i++) + { + outEntries[i].packedWord = packWord(alias[i], probability[i]); + outEntries[i].ownPdf = pdf[i]; + } + } }; } // namespace sampling diff --git a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl index 35f1391930..b56073074b 100644 --- a/include/nbl/builtin/hlsl/sampling/bilinear.hlsl +++ b/include/nbl/builtin/hlsl/sampling/bilinear.hlsl @@ -45,10 +45,10 @@ struct Bilinear vector2_type twiceAreasUnderXCurve = vector2_type(bilinearCoeffs[0] + bilinearCoeffs[1], bilinearCoeffs[2] + bilinearCoeffs[3]); // Linear::create adds FLT_MIN internally, replicate here so both divisions share // the same denominator (sum + 2*min), enabling CSE to merge them into one division - const scalar_type safeSum = twiceAreasUnderXCurve[0] + twiceAreasUnderXCurve[1] + scalar_type(2.0) * hlsl::numeric_limits::min; - const scalar_type yNormFactor = scalar_type(2.0) / safeSum; + const scalar_type safeSum = twiceAreasUnderXCurve[0] + twiceAreasUnderXCurve[1] + _static_cast(2.0) * hlsl::numeric_limits::min; + const scalar_type yNormFactor = _static_cast(2.0) / safeSum; retval.lineary = Linear::create(twiceAreasUnderXCurve); - retval.normFactor = yNormFactor * scalar_type(2.0); + retval.normFactor = yNormFactor * _static_cast(2.0); return retval; } @@ -65,7 +65,7 @@ struct Bilinear // bilinear PDF = marginal_y_pdf * conditional_x_pdf; reuse both linear caches const scalar_type yPdf = lineary.forwardPdf(u.y, linearYCache); - cache.normalizedStart = yPdf * linearx.linearCoeffStart; + cache.normalizedStart = yPdf * linearx.normalizedCoeffStart; cache.linearXCache.diffTimesX *= yPdf; return p; } diff --git a/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl b/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl index b9efd1ec35..9ae0c83c05 100644 --- a/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl +++ b/include/nbl/builtin/hlsl/sampling/box_muller_transform.hlsl @@ -37,16 +37,16 @@ struct BoxMullerTransform { BoxMullerTransform retval; retval.stddev = _stddev; - retval.halfRcpStddev2 = scalar_type(0.5) / (_stddev * _stddev); + retval.halfRcpStddev2 = _static_cast(0.5) / (_stddev * _stddev); return retval; } codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC { scalar_type sinPhi, cosPhi; - math::sincos(scalar_type(2.0) * numbers::pi * u.y - numbers::pi, sinPhi, cosPhi); + math::sincos(_static_cast(2.0) * numbers::pi * u.y - numbers::pi, sinPhi, cosPhi); cache.direction = vector2_type(cosPhi, sinPhi); - return cache.direction * nbl::hlsl::sqrt(scalar_type(-2.0) * nbl::hlsl::log(u.x)) * stddev; + return cache.direction * nbl::hlsl::sqrt(_static_cast(-2.0) * nbl::hlsl::log(u.x)) * stddev; } density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC diff --git a/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl b/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl index c9e5cac5d6..d31c2d994a 100644 --- a/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl +++ b/include/nbl/builtin/hlsl/sampling/concentric_mapping.hlsl @@ -41,7 +41,7 @@ struct ConcentricMapping static codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) { // map [0,1]^2 to [-1,1]^2 - const vector2_type centered = scalar_type(2) * u - hlsl::promote(scalar_type(1)); + const vector2_type centered = _static_cast(2) * u - hlsl::promote(_static_cast(1)); const scalar_type a = centered.x; const scalar_type b = centered.y; @@ -51,10 +51,10 @@ struct ConcentricMapping const scalar_type dominant = hlsl::select(cond, a, b); const scalar_type minor = hlsl::select(cond, b, a); - const scalar_type safe_dominant = dominant != scalar_type(0) ? dominant : scalar_type(0); + const scalar_type safe_dominant = dominant != _static_cast(0) ? dominant : _static_cast(0); const scalar_type ratio = minor / safe_dominant; - const scalar_type angle = scalar_type(0.25) * numbers::pi * ratio; + const scalar_type angle = _static_cast(0.25) * numbers::pi * ratio; const scalar_type c = hlsl::cos(angle); const scalar_type s = hlsl::sin(angle); @@ -90,7 +90,7 @@ struct ConcentricMapping // angle in [0, pi/4] const scalar_type phi = hlsl::atan2(num, denom); - const scalar_type minor_val = r * phi / (scalar_type(0.25) * numbers::pi); + const scalar_type minor_val = r * phi / (_static_cast(0.25) * numbers::pi); // reconstruct a,b using select instead of branching const scalar_type a_base = hlsl::select(swapped, minor_val, r); @@ -99,7 +99,7 @@ struct ConcentricMapping const scalar_type a = ieee754::copySign(a_base, p.x); const scalar_type b = ieee754::copySign(b_base, p.y); - return (vector2_type(a, b) + hlsl::promote(scalar_type(1))) * scalar_type(0.5); + return (vector2_type(a, b) + hlsl::promote(_static_cast(1))) * _static_cast(0.5); } // The PDF of Shirley mapping is constant (1/PI on the unit disk) diff --git a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl index 23e35e2f7d..52b063f448 100644 --- a/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl +++ b/include/nbl/builtin/hlsl/sampling/cos_weighted_spheres.hlsl @@ -35,8 +35,8 @@ struct ProjectedHemisphere static codomain_type __generate(const domain_type u) { - vector_t2 p = ConcentricMapping::generate(u * T(0.99999) + T(0.000005)); - T z = hlsl::sqrt(hlsl::max(T(0.0), T(1.0) - p.x * p.x - p.y * p.y)); + vector_t2 p = ConcentricMapping::generate(u * _static_cast(0.99999) + _static_cast(0.000005)); + T z = hlsl::sqrt(hlsl::max(_static_cast(0.0), _static_cast(1.0) - p.x * p.x - p.y * p.y)); return vector_t3(p.x, p.y, z); } @@ -93,11 +93,11 @@ struct ProjectedSphere static codomain_type __generate(NBL_REF_ARG(domain_type) u) { vector_t3 retval = hemisphere_t::__generate(u.xy); - const bool chooseLower = u.z > T(0.5); + const bool chooseLower = u.z > _static_cast(0.5); retval.z = chooseLower ? (-retval.z) : retval.z; if (chooseLower) - u.z -= T(0.5); - u.z *= T(2.0); + u.z -= _static_cast(0.5); + u.z *= _static_cast(2.0); return retval; } @@ -110,7 +110,7 @@ struct ProjectedSphere static density_type forwardPdf(const domain_type u, const cache_type cache) { - return T(0.5) * cache.z * numbers::inv_pi; + return _static_cast(0.5) * cache.z * numbers::inv_pi; } static weight_type forwardWeight(const domain_type u, const cache_type cache) @@ -120,7 +120,7 @@ struct ProjectedSphere static density_type backwardPdf(const codomain_type L) { - return T(0.5) * hlsl::abs(L.z) * numbers::inv_pi; + return _static_cast(0.5) * hlsl::abs(L.z) * numbers::inv_pi; } static weight_type backwardWeight(const codomain_type L) diff --git a/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl index 1f176f9713..d3657babb1 100644 --- a/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl +++ b/include/nbl/builtin/hlsl/sampling/cumulative_probability.hlsl @@ -16,18 +16,38 @@ namespace hlsl namespace sampling { -// Discrete sampler using cumulative probability lookup via upper_bound. +// Discrete sampler using cumulative probability lookup. // // Samples a discrete index in [0, N) with probability proportional to // precomputed weights in O(log N) time per sample. // -// The cumulative probability array stores N-1 entries (the last bucket -// is always 1.0 and need not be stored). Entry i holds the sum of -// probabilities for indices [0, i]. +// Three layouts / cache-population strategies, selected by the Mode parameter: +// +// TRACKING (default): N-1 CDF entries, last bucket implicit at 1.0. +// A stateful comparator records the straddling CDF +// values during upper_bound itself. +// YOLO: Same storage. Plain upper_bound followed by two +// re-reads of the adjacent CDF entries (warm cache). +// Lower register footprint, two extra array reads. +// EYTZINGER: Level-order implicit binary tree in 2*P entries +// where P = roundUpPot(N). Leaves at [P, P+N) hold +// the CDF; interior nodes at [1, P) hold split keys. +// Descent reads adjacent memory at each step, so +// every cache line pulled is fully utilised and the +// first log2(subgroupSize) iterations are served by a +// single transaction per subgroup. Build with +// sampling::buildEytzinger(). // // Satisfies TractableSampler and ResamplableSampler (not BackwardTractableSampler: // the mapping is discrete). -template) struct CumulativeProbabilitySampler { @@ -44,58 +64,116 @@ struct CumulativeProbabilitySampler density_type upperBound; }; + // `_size` is the user-facing bucket count N for every mode. TRACKING / YOLO + // expect the accessor to hold N-1 CDF entries; EYTZINGER expects 2*P entries + // in the level-order layout produced by buildEytzinger. static CumulativeProbabilitySampler create(NBL_CONST_REF_ARG(CumProbAccessor) _cumProbAccessor, uint32_t _size) { CumulativeProbabilitySampler retval; retval.cumProbAccessor = _cumProbAccessor; retval.storedCount = _size - 1u; + retval.depth = 0u; + NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER) + { + uint32_t P = 1u; + uint32_t d = 0u; + while (P < _size) { P <<= 1u; ++d; } + retval.depth = d; + } return retval; } // BasicSampler interface codomain_type generate(const domain_type u) NBL_CONST_MEMBER_FUNC { - // upper_bound returns first index where cumProb > u - return hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u); + NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER) + { + const uint32_t leafBase = 1u << depth; + uint32_t index = 1u; + for (uint32_t iter = 0u; iter < depth; ++iter) + { + density_type key; + cumProbAccessor.template get(index, key); + index = (index << 1u) | uint32_t(!(u < key)); + } + const codomain_type result = codomain_type(index - leafBase); + return result < codomain_type(storedCount) ? result : codomain_type(storedCount); + } + else + { + return hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u); + } } // TractableSampler interface codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC { - // #define NBL_CUMPROB_YOLO_READS -#ifdef NBL_CUMPROB_YOLO_READS - // YOLO approach: re-read the array after binary search. - // The accessed elements are adjacent to the found index so the cache is warm. - const codomain_type result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u); - cache.oneBefore = density_type(0.0); - if (result) - cumProbAccessor.template get(result - 1u, cache.oneBefore); - cache.upperBound = density_type(1.0); - if (result < storedCount) - cumProbAccessor.template get(result, cache.upperBound); -#else - // Tracking reads approach: stateful comparator captures CDF values during binary search. - struct CdfComparator + codomain_type result; + NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER) { - bool operator()(const density_type value, const density_type rhs) + // Descent visits one interior node per level. Going left tightens + // the upper bound to the current key; going right tightens the + // lower bound. Final index, leafBase is the bucket. + cache.oneBefore = _static_cast(0.0); + cache.upperBound = _static_cast(1.0); + const uint32_t leafBase = 1u << depth; + uint32_t index = 1u; + for (uint32_t iter = 0u; iter < depth; ++iter) { - const bool retval = value < rhs; - if (retval) - upperBound = rhs; + density_type key; + cumProbAccessor.template get(index, key); + const bool goRight = !(u < key); + if (goRight) + { + cache.oneBefore = key; + index = (index << 1u) | 1u; + } else - oneBefore = rhs; - return retval; + { + cache.upperBound = key; + index = (index << 1u); + } } - - density_type oneBefore; - density_type upperBound; - } comp; - comp.oneBefore = density_type(0.0); - comp.upperBound = density_type(1.0); - const codomain_type result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u, comp); - cache.oneBefore = comp.oneBefore; - cache.upperBound = comp.upperBound; -#endif + const codomain_type raw = codomain_type(index - leafBase); + result = raw < codomain_type(storedCount) ? raw : codomain_type(storedCount); + } + else NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::YOLO) + { + // Re-read the two adjacent CDF entries after the binary search. + // Both sit on the cache lines the search just touched, so they are warm. + result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u); + cache.oneBefore = _static_cast(0.0); + if (result) + cumProbAccessor.template get(result - 1u, cache.oneBefore); + cache.upperBound = _static_cast(1.0); + if (result < storedCount) + cumProbAccessor.template get(result, cache.upperBound); + } + else + { + // TRACKING: stateful comparator captures the CDF values straddling the + // found index during the binary search itself, avoiding the two extra reads. + struct CdfComparator + { + bool operator()(const density_type value, const density_type rhs) + { + const bool retval = value < rhs; + if (retval) + upperBound = rhs; + else + oneBefore = rhs; + return retval; + } + + density_type oneBefore; + density_type upperBound; + } comp; + comp.oneBefore = _static_cast(0.0); + comp.upperBound = _static_cast(1.0); + result = hlsl::upper_bound(cumProbAccessor, 0u, storedCount, u, comp); + cache.oneBefore = comp.oneBefore; + cache.upperBound = comp.upperBound; + } return result; } @@ -111,16 +189,34 @@ struct CumulativeProbabilitySampler density_type backwardPdf(const codomain_type v) NBL_CONST_MEMBER_FUNC { - density_type retval = density_type(1.0); - if (v < storedCount) - cumProbAccessor.template get(v, retval); - if (v) + NBL_IF_CONSTEXPR(Mode == CumulativeProbabilityMode::EYTZINGER) { - density_type prev; - cumProbAccessor.template get(v - 1u, prev); - retval -= prev; + // Leaves store the CDF directly; the last real leaf is normalized + // to 1.0 and padded leaves (if any) also hold 1.0. + const uint32_t leafBase = 1u << depth; + density_type retval; + cumProbAccessor.template get(leafBase + uint32_t(v), retval); + if (v) + { + density_type prev; + cumProbAccessor.template get(leafBase + uint32_t(v) - 1u, prev); + retval -= prev; + } + return retval; + } + else + { + density_type retval = _static_cast(1.0); + if (v < storedCount) + cumProbAccessor.template get(v, retval); + if (v) + { + density_type prev; + cumProbAccessor.template get(v - 1u, prev); + retval -= prev; + } + return retval; } - return retval; } weight_type backwardWeight(const codomain_type v) NBL_CONST_MEMBER_FUNC @@ -129,7 +225,8 @@ struct CumulativeProbabilitySampler } CumProbAccessor cumProbAccessor; - uint32_t storedCount; + uint32_t storedCount; // N - 1 (last real bucket index) + uint32_t depth; // EYTZINGER only: ceil(log2(N)), iteration count; leafBase = 1 << depth }; } // namespace sampling diff --git a/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h b/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h index a511fc2d8c..bf98d5ec93 100644 --- a/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h +++ b/include/nbl/builtin/hlsl/sampling/cumulative_probability_builder.h @@ -30,6 +30,61 @@ void computeNormalizedCumulativeHistogram(std::span weights, T* outCumP std::for_each(outCumProb, outCumProb + N - 1, [normalizationFactor](T& v) { v *= normalizationFactor; }); } +// Returns the next power of two >= x (and 1 for x <= 1). Matches the leaf-count +// the Eytzinger builder pads to. +inline uint32_t eytzingerLeafCount(uint32_t N) +{ + uint32_t P = 1u; + while (P < N) P <<= 1u; + return P; +} + +// Builds an Eytzinger-layout CDF for cache-friendly binary search on the GPU. +// +// Layout (1-indexed, size 2*P where P = eytzingerLeafCount(N)): +// tree[0] unused (keeps parent/child arithmetic branch-free) +// tree[1 .. P-1] interior split keys; tree[v] == rightmost leaf of v's left subtree +// tree[P .. P+N-1] leaves, tree[P + i] = normalized inclusive scan of weights up to i +// tree[P+N .. 2P-1] padded leaves, all 1.0 (any u < 1.0 routes away from these) +// +// The sampler walks the tree as index = (index << 1) | goRight for ceil(log2(N)) +// iterations. Successive taps within one search land on adjacent memory, so every +// cache line pulled is fully used and the first log2(subgroupSize) iterations are +// served by a single memory transaction per subgroup. +template +void buildEytzinger(std::span weights, T* outTree) +{ + const uint32_t N = static_cast(weights.size()); + if (N == 0) + return; + + const uint32_t P = eytzingerLeafCount(N); + + T total = T(0); + for (uint32_t i = 0; i < N; ++i) + total += weights[i]; + const T rcpTotal = T(1) / total; + + T acc = T(0); + for (uint32_t i = 0; i < N; ++i) + { + acc += weights[i]; + outTree[P + i] = acc * rcpTotal; + } + for (uint32_t i = N; i < P; ++i) + outTree[P + i] = T(1); + + // Bottom-up: each interior node copies the rightmost leaf of its left subtree, + // found by descending left-then-always-right from v. + for (uint32_t v = P; v-- > 1u;) + { + uint32_t r = v << 1u; + while (r < P) + r = (r << 1u) | 1u; + outTree[v] = outTree[r]; + } +} + } // namespace sampling } // namespace hlsl } // namespace nbl diff --git a/include/nbl/builtin/hlsl/sampling/linear.hlsl b/include/nbl/builtin/hlsl/sampling/linear.hlsl index 12602b4a79..cd260cea3a 100644 --- a/include/nbl/builtin/hlsl/sampling/linear.hlsl +++ b/include/nbl/builtin/hlsl/sampling/linear.hlsl @@ -39,14 +39,14 @@ struct Linear // add min to both coefficients so (0,0) input produces a valid uniform sampler // instead of inf normalization (2/0) leading to NaN; negligible for normal inputs const vector2_type safeCoeffs = linearCoeffs + vector2_type(hlsl::numeric_limits::min, hlsl::numeric_limits::min); - // normalize coefficients so that the PDF is simply linearCoeffStart + linearCoeffDiff * x - const scalar_type normFactor = scalar_type(2.0) / (safeCoeffs[0] + safeCoeffs[1]); + // normalize coefficients so that the PDF is simply normalizedCoeffStart + linearCoeffDiff * x + const scalar_type normFactor = _static_cast(2.0) / (safeCoeffs[0] + safeCoeffs[1]); const vector2_type normalized = safeCoeffs * normFactor; - retval.linearCoeffStart = normalized[0]; - retval.linearCoeffEnd = normalized[1]; + retval.normalizedCoeffStart = normalized[0]; + retval.normalizedCoeffEnd = normalized[1]; // precompute for the stable quadratic in generate() retval.squaredCoeffStart = normalized[0] * normalized[0]; - retval.twoTimesDiff = scalar_type(2.0) * (normalized[1] - normalized[0]); + retval.twoTimesDiff = _static_cast(2.0) * (normalized[1] - normalized[0]); return retval; } @@ -57,18 +57,18 @@ struct Linear // Quadratic (1-start)*x^2 + start*x - u = 0; since start >= 0 the stable root is // x = 2u / (start + sqrt(start^2 + 2*diff*u)), which never cancels. const scalar_type sqrtTerm = sqrt(squaredCoeffStart + twoTimesDiff * u); - const scalar_type denom = linearCoeffStart + sqrtTerm; + const scalar_type denom = normalizedCoeffStart + sqrtTerm; // NOTE: floating point can make x slightly > 1 when u~1 and diff < 0; callers needing // non-negative PDF at the boundary should clamp with min(x, 1). const codomain_type x = (u + u) / denom; // diff*x == sqrtTerm - start algebraically (conjugate identity), saves 1 mul - cache.diffTimesX = sqrtTerm - linearCoeffStart; + cache.diffTimesX = sqrtTerm - normalizedCoeffStart; return x; } density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC { - return linearCoeffStart + cache.diffTimesX; + return normalizedCoeffStart + cache.diffTimesX; } weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC @@ -82,8 +82,8 @@ struct Linear // Not used because we already store start for generate(). density_type backwardPdf(const codomain_type x) NBL_CONST_MEMBER_FUNC { - assert(x >= scalar_type(0.0) && x <= scalar_type(1.0)); - return hlsl::mix(linearCoeffStart, linearCoeffEnd, x); + assert(x >= _static_cast(0.0) && x <= _static_cast(1.0)); + return hlsl::mix(normalizedCoeffStart, normalizedCoeffEnd, x); } weight_type backwardWeight(const codomain_type x) NBL_CONST_MEMBER_FUNC @@ -91,8 +91,8 @@ struct Linear return backwardPdf(x); } - scalar_type linearCoeffStart; - scalar_type linearCoeffEnd; + scalar_type normalizedCoeffStart; + scalar_type normalizedCoeffEnd; scalar_type squaredCoeffStart; scalar_type twoTimesDiff; }; diff --git a/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl b/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl index 719eaf504f..64d5e69b96 100644 --- a/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl +++ b/include/nbl/builtin/hlsl/sampling/polar_mapping.hlsl @@ -32,7 +32,7 @@ struct PolarMapping static codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) { const scalar_type r = hlsl::sqrt(u.x); - const scalar_type phi = scalar_type(2) * numbers::pi * u.y; + const scalar_type phi = _static_cast(2) * numbers::pi * u.y; return vector2_type(r * hlsl::cos(phi), r * hlsl::sin(phi)); } @@ -46,8 +46,8 @@ struct PolarMapping { const scalar_type r2 = p.x * p.x + p.y * p.y; scalar_type phi = hlsl::atan2(p.y, p.x); - phi += hlsl::mix(scalar_type(0), scalar_type(2) * numbers::pi, phi < scalar_type(0)); - return vector2_type(r2, phi * (scalar_type(0.5) * numbers::inv_pi)); + phi += hlsl::mix(_static_cast(0), _static_cast(2) * numbers::pi, phi < _static_cast(0)); + return vector2_type(r2, phi * (_static_cast(0.5) * numbers::inv_pi)); } static density_type forwardPdf(const domain_type u, cache_type cache) { return numbers::inv_pi; } diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl index 5bf652cb4c..1ad8b5462e 100644 --- a/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl +++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_rectangle.hlsl @@ -25,147 +25,166 @@ namespace sampling // 2. Warp uniform [0,1]^2 through the bilinear to importance-sample NdotL // 3. Feed the warped UV into the solid angle sampler to get a rect offset // 4. PDF = (1/SolidAngle) * bilinearPdf -// -// Template parameter `UsePdfAsWeight`: when true (default), forwardWeight/backwardWeight -// return the PDF instead of the projected-solid-angle MIS weight. -// TODO: the projected-solid-angle MIS weight (UsePdfAsWeight=false) has been shown to be -// poor in practice. Once confirmed by testing, remove the false path and stop storing -// receiverNormal, receiverWasBSDF, and rcpProjSolidAngle as members. template struct ProjectedSphericalRectangle { - using scalar_type = T; - using vector2_type = vector; - using vector3_type = vector; - using vector4_type = vector; - - // BackwardTractableSampler concept types - using domain_type = vector2_type; - using codomain_type = vector3_type; - using density_type = scalar_type; - using weight_type = density_type; - - struct cache_type - { - scalar_type abs_cos_theta; - vector2_type warped; - typename Bilinear::cache_type bilinearCache; - }; - - // NOTE: produces a degenerate (all-zero) bilinear patch when the receiver normal faces away - // from all four rectangle vertices, resulting in NaN PDFs (0 * inf). Callers must ensure - // at least one vertex has positive projection onto the receiver normal. - static ProjectedSphericalRectangle create(NBL_CONST_REF_ARG(shapes::SphericalRectangle) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF) - { - ProjectedSphericalRectangle retval; - const vector3_type n = hlsl::mul(shape.basis, _receiverNormal); - retval.localReceiverNormal = n; - retval.receiverWasBSDF = _receiverWasBSDF; - - // Compute solid angle and get r0 in local frame (before z-flip) - const typename shapes::SphericalRectangle::solid_angle_type sa = shape.solidAngle(observer); - const vector3_type r0 = sa.r0; - - // All 4 corners share r0.z; x is r0.x or r0.x+ex, y is r0.y or r0.y+ey - const scalar_type r1x = r0.x + shape.extents.x; - const scalar_type r1y = r0.y + shape.extents.y; - - // Unnormalized dots: dot(corner_i, n) - const scalar_type base_dot = hlsl::dot(r0, n); - const scalar_type dx = shape.extents.x * n.x; - const scalar_type dy = shape.extents.y * n.y; - const vector4_type dots = vector4_type(base_dot, base_dot + dx, base_dot + dy, base_dot + dx + dy); - - // Squared lengths of each corner - const scalar_type r0zSq = r0.z * r0.z; - const vector4_type lenSq = vector4_type( - r0.x * r0.x + r0.y * r0.y, - r1x * r1x + r0.y * r0.y, - r0.x * r0.x + r1y * r1y, - r1x * r1x + r1y * r1y - ) + hlsl::promote(r0zSq); - - // dot(normalize(corner), n) = dot(corner, n) * rsqrt(lenSq) - // Bilinear corners: [0]=v00 [1]=v10 [2]=v01 [3]=v11 - const scalar_type minimumProjSolidAngle = 0.0; - const vector4_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF, - dots * hlsl::rsqrt(lenSq), - hlsl::promote(minimumProjSolidAngle)); - retval.bilinearPatch = Bilinear::create(bxdfPdfAtVertex); - - // Reuse the already-computed solid_angle_type to avoid recomputing mul(basis, origin - observer) - retval.sphrect = SphericalRectangle::create(sa, shape.extents); - retval.rcpSolidAngle = retval.sphrect.solidAngle > scalar_type(0.0) ? scalar_type(1.0) / retval.sphrect.solidAngle : scalar_type(0.0); - - NBL_IF_CONSTEXPR(!UsePdfAsWeight) - { - const scalar_type projSA = shape.projectedSolidAngleFromLocal(r0, n); - retval.rcpProjSolidAngle = projSA > scalar_type(0.0) ? scalar_type(1.0) / projSA : scalar_type(0.0); - } - return retval; - } - - // returns a normalized 3D direction in the local frame - codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC - { - Bilinear bilinear = bilinearPatch; - cache.warped = bilinear.generate(u, cache.bilinearCache); - typename SphericalRectangle::cache_type sphrectCache; - const vector3_type dir = sphrect.generate(cache.warped, sphrectCache); - cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache); - return dir; - } - - // returns a 2D offset on the rectangle surface from the r0 corner - vector2_type generateSurfaceOffset(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC - { - Bilinear bilinear = bilinearPatch; - cache.warped = bilinear.generate(u, cache.bilinearCache); - typename SphericalRectangle::cache_type sphrectCache; - const vector2_type sampleOffset = sphrect.generateSurfaceOffset(cache.warped, sphrectCache); - cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache); - return sampleOffset; - } - - density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC - { - return rcpSolidAngle * bilinearPatch.forwardPdf(u, cache.bilinearCache); - } - - weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC - { - if (UsePdfAsWeight) - return forwardPdf(u, cache); - return cache.abs_cos_theta * rcpProjSolidAngle; - } - - // `p` is the normalized [0,1]^2 position on the rectangle - density_type backwardPdf(const vector2_type p) NBL_CONST_MEMBER_FUNC - { - return rcpSolidAngle * bilinearPatch.backwardPdf(p); - } - - weight_type backwardWeight(const vector2_type p) NBL_CONST_MEMBER_FUNC - { - NBL_IF_CONSTEXPR(UsePdfAsWeight) - return backwardPdf(p); - const scalar_type minimumProjSolidAngle = 0.0; - // Reconstruct local direction from normalized rect position - const vector3_type localDir = hlsl::normalize(sphrect.r0 + vector3_type( - p.x * sphrect.extents.x, - p.y * sphrect.extents.y, - scalar_type(0) - )); - const scalar_type abs_cos_theta = math::conditionalAbsOrMax(receiverWasBSDF, hlsl::dot(localReceiverNormal, localDir), minimumProjSolidAngle); - return abs_cos_theta * rcpProjSolidAngle; - } - - sampling::SphericalRectangle sphrect; - Bilinear bilinearPatch; - scalar_type rcpSolidAngle; - scalar_type rcpProjSolidAngle; - vector3_type localReceiverNormal; - bool receiverWasBSDF; + using scalar_type = T; + using vector2_type = vector; + using vector3_type = vector; + using vector4_type = vector; + + // BackwardTractableSampler concept types + using domain_type = vector2_type; + using codomain_type = vector3_type; + using density_type = scalar_type; + using weight_type = density_type; + + struct cache_type + { + typename Bilinear::cache_type bilinearCache; + vector3_type L; // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false + }; + + // Shared finalization for both create() overloads: builds the bilinear patch, the inner sphrect + // sampler, and the UsePdfAsWeight=false extras. The two overloads differ only in how they + // compute bxdfPdfAtVertex (worldspace corner normalizations vs local-frame rsqrt(lenSq)). + static ProjectedSphericalRectangle create(NBL_CONST_REF_ARG(shapes::SphericalRectangle) shape, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle::solid_angle_type) sa, + const vector4_type bxdfPdfAtVertex, const vector3_type _receiverNormal) + { + ProjectedSphericalRectangle retval; + retval.bilinearPatch = Bilinear::create(bxdfPdfAtVertex); + // Reuse solid_angle_type to avoid recomputing mul(basis, origin - observer) + retval.sphrect = SphericalRectangle::create(shape.basis, sa, shape.extents); + NBL_IF_CONSTEXPR(!UsePdfAsWeight) + { + retval.receiverNormal = _receiverNormal; + const vector3_type nLocal = hlsl::mul(shape.basis, _receiverNormal); + retval.projSolidAngle = shape.projectedSolidAngleFromLocal(sa.r0, nLocal); + } + return retval; + } + + // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler + static ProjectedSphericalRectangle create(NBL_CONST_REF_ARG(shapes::CompressedSphericalRectangle) compressed, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF) + { + // 4 normalized worldspace corners dotted with the worldspace receiver normal. Avoids the + // mul(basis, receiverNormal) data dependency of the uncompressed overload so these 4 + // normalize+dot chains can pipeline alongside the basis/solid-angle work below. + const vector3_type c0 = compressed.origin - observer; + const vector3_type c1 = c0 + compressed.right; + const vector3_type c2 = c0 + compressed.up; + const vector3_type c3 = c1 + compressed.up; + const vector4_type dots = vector4_type( + hlsl::dot(hlsl::normalize(c0), _receiverNormal), + hlsl::dot(hlsl::normalize(c1), _receiverNormal), + hlsl::dot(hlsl::normalize(c2), _receiverNormal), + hlsl::dot(hlsl::normalize(c3), _receiverNormal)); + const scalar_type minimumProjSolidAngle = _static_cast(0.0); + const vector4_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF, dots, hlsl::promote(minimumProjSolidAngle)); + + const shapes::SphericalRectangle shape = shapes::SphericalRectangle::create(compressed); + const typename shapes::SphericalRectangle::solid_angle_type sa = shape.solidAngle(observer); + return create(shape, sa, bxdfPdfAtVertex, _receiverNormal); + } + + // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler + static ProjectedSphericalRectangle create(NBL_CONST_REF_ARG(shapes::SphericalRectangle) shape, const vector3_type observer, const vector3_type _receiverNormal, const bool _receiverWasBSDF) + { + // Local-frame path: unnormalized dot(corner_i, n) with n = basis * receiverNormal, then + // a single rsqrt(lenSq) for all 4 corner normalizations at once. + const vector3_type n = hlsl::mul(shape.basis, _receiverNormal); + const typename shapes::SphericalRectangle::solid_angle_type sa = shape.solidAngle(observer); + const vector3_type r0 = sa.r0; + + // All 4 corners share r0.z; x is r0.x or r0.x+ex, y is r0.y or r0.y+ey + const scalar_type r1x = r0.x + shape.extents.x; + const scalar_type r1y = r0.y + shape.extents.y; + const scalar_type base_dot = hlsl::dot(r0, n); + const scalar_type dx = shape.extents.x * n.x; + const scalar_type dy = shape.extents.y * n.y; + const vector4_type dots = vector4_type(base_dot, base_dot + dx, base_dot + dy, base_dot + dx + dy); + + const scalar_type r0zSq = r0.z * r0.z; + const vector4_type lenSq = vector4_type( + r0.x * r0.x + r0.y * r0.y, + r1x * r1x + r0.y * r0.y, + r0.x * r0.x + r1y * r1y, + r1x * r1x + r1y * r1y) + + hlsl::promote(r0zSq); + + // dot(normalize(corner), n) = dot(corner, n) * rsqrt(lenSq). Bilinear corners: [0]=v00 [1]=v10 [2]=v01 [3]=v11 + const scalar_type minimumProjSolidAngle = _static_cast(0.0); + const vector4_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF, + dots * hlsl::rsqrt(lenSq), + hlsl::promote(minimumProjSolidAngle)); + + return create(shape, sa, bxdfPdfAtVertex, _receiverNormal); + } + + // returns a normalized 3D direction in the local frame + codomain_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC + { + const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache); + typename SphericalRectangle::cache_type sphrectCache; // there's nothing in the cache + const vector3_type dir = sphrect.generateNormalizedLocal(warped, sphrectCache, hitDist); + NBL_IF_CONSTEXPR(!UsePdfAsWeight) + cache.L = hlsl::mul(hlsl::transpose(sphrect.basis), dir); + return dir; + } + + // returns a unnormalized 3D direction in the global frame + codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache); + typename SphericalRectangle::cache_type sphrectCache; // there's nothing in the cache + const vector3_type dir = sphrect.generateUnnormalized(warped, sphrectCache); + NBL_IF_CONSTEXPR(!UsePdfAsWeight) + cache.L = dir * hlsl::rsqrt(hlsl::dot(dir, dir)); + return dir; + } + + // returns a normalized 3D direction in the global frame + codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache); + typename SphericalRectangle::cache_type sphrectCache; // there's nothing in the cache + const vector3_type dir = sphrect.generate(warped, sphrectCache); + NBL_IF_CONSTEXPR(!UsePdfAsWeight) + cache.L = dir; + return dir; + } + + density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC + { + return bilinearPatch.forwardPdf(u, cache.bilinearCache) / sphrect.solidAngle; + } + + weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC + { + NBL_IF_CONSTEXPR(UsePdfAsWeight) + return forwardPdf(u, cache); + return backwardWeight(cache.L); + } + + weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC + { + NBL_IF_CONSTEXPR(UsePdfAsWeight) + { +#if 0 + const vector2_type warped = sphrect.generateInvese(L); // TODO: implement `generateInverse` + return bilinearPatch.backwardPdf(warped) / sphrect.solidAngle; +#endif + return hlsl::numeric_limits::quiet_NaN; + } + // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used + return hlsl::abs(hlsl::dot(L, receiverNormal)) / projSolidAngle; + } + + sampling::SphericalRectangle sphrect; + Bilinear bilinearPatch; + // TODO: same as projected triangle w.r.t. UsePdfAsWeight==false + vector3_type receiverNormal; + scalar_type projSolidAngle; }; } // namespace sampling diff --git a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl index c4bc5fcea8..6eddd03e8a 100644 --- a/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl +++ b/include/nbl/builtin/hlsl/sampling/projected_spherical_triangle.hlsl @@ -47,73 +47,68 @@ struct ProjectedSphericalTriangle struct cache_type { - scalar_type abs_cos_theta; - vector2_type warped; typename Bilinear::cache_type bilinearCache; + vector3_type L; // TODO: erase when UsePdfAsWeight==false }; - // NOTE: produces a degenerate (all-zero) bilinear patch when the receiver normal faces away - // from all three triangle vertices, resulting in NaN PDFs (0 * inf). Callers must ensure - // at least one vertex has positive projection onto the receiver normal. + // Shouldn't produce NAN if all corners have 0 proj solid angle due to min density adds/clamps in the linear sampler static ProjectedSphericalTriangle create(NBL_REF_ARG(shapes::SphericalTriangle) shape, const vector3_type _receiverNormal, const bool _receiverWasBSDF) { ProjectedSphericalTriangle retval; - retval.sphtri = SphericalTriangle::create(shape); - retval.receiverNormal = _receiverNormal; - retval.receiverWasBSDF = _receiverWasBSDF; + retval.sphtri = SphericalTriangle::create(shape); const scalar_type minimumProjSolidAngle = 0.0; matrix m = matrix(shape.vertices[0], shape.vertices[1], shape.vertices[2]); const vector3_type bxdfPdfAtVertex = math::conditionalAbsOrMax(_receiverWasBSDF, hlsl::mul(m, _receiverNormal), hlsl::promote(minimumProjSolidAngle)); retval.bilinearPatch = Bilinear::create(bxdfPdfAtVertex.yyxz); - const scalar_type projSA = shape.projectedSolidAngle(_receiverNormal); - retval.rcpProjSolidAngle = projSA > scalar_type(0.0) ? scalar_type(1.0) / projSA : scalar_type(0.0); + NBL_IF_CONSTEXPR(!UsePdfAsWeight) + { + retval.receiverNormal = _receiverNormal; + // prevent division of 0 cosine by 0 + retval.projSolidAngle = max(shape.projectedSolidAngle(_receiverNormal),numeric_limits::min); + } return retval; } codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC { - Bilinear bilinear = bilinearPatch; - cache.warped = bilinear.generate(u, cache.bilinearCache); - typename SphericalTriangle::cache_type sphtriCache; - const vector3_type L = sphtri.generate(cache.warped, sphtriCache); - cache.abs_cos_theta = bilinear.forwardWeight(u, cache.bilinearCache); + const vector2_type warped = bilinearPatch.generate(u, cache.bilinearCache); + typename SphericalTriangle::cache_type sphtriCache; // PDF is constant caches nothing, its empty + const codomain_type L = sphtri.generate(warped, sphtriCache); + NBL_IF_CONSTEXPR(!UsePdfAsWeight) + cache.L = L; return L; } density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC { - return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u, cache.bilinearCache); + return sphtri.rcpSolidAngle * bilinearPatch.forwardPdf(u,cache.bilinearCache); } weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC { - if (UsePdfAsWeight) - return forwardPdf(u, cache); - return cache.abs_cos_theta * rcpProjSolidAngle; - } - - density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC - { - const vector2_type u = sphtri.generateInverse(L); - return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u); + NBL_IF_CONSTEXPR (UsePdfAsWeight) + return forwardPdf(u,cache); + return backwardWeight(cache.L); } weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC { - NBL_IF_CONSTEXPR(UsePdfAsWeight) - return backwardPdf(L); - const scalar_type minimumProjSolidAngle = 0.0; - const scalar_type abs_cos_theta = math::conditionalAbsOrMax(receiverWasBSDF, hlsl::dot(receiverNormal, L), minimumProjSolidAngle); - return abs_cos_theta * rcpProjSolidAngle; + NBL_IF_CONSTEXPR (UsePdfAsWeight) + { + const vector2_type u = sphtri.generateInverse(L); + return sphtri.rcpSolidAngle * bilinearPatch.backwardPdf(u); + } + // make the MIS weight always abs because even when receiver is a BRDF, the samples in lower hemisphere will get killed and MIS weight never used + return hlsl::abs(hlsl::dot(L,receiverNormal))/projSolidAngle; } - sampling::SphericalTriangle sphtri; + sampling::SphericalTriangle sphtri; Bilinear bilinearPatch; - scalar_type rcpProjSolidAngle; + // TODO: erase when UsePdfAsWeight==false vector3_type receiverNormal; - bool receiverWasBSDF; + scalar_type projSolidAngle; }; } // namespace sampling diff --git a/include/nbl/builtin/hlsl/sampling/spherical_pyramid.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_pyramid.hlsl new file mode 100644 index 0000000000..a44b8ff73d --- /dev/null +++ b/include/nbl/builtin/hlsl/sampling/spherical_pyramid.hlsl @@ -0,0 +1,398 @@ +// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_SAMPLING_SPHERICAL_PYRAMID_HLSL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SAMPLING_SPHERICAL_PYRAMID_HLSL_INCLUDED_ + +#include +#include +#include +#include +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace sampling +{ + +// Tag-dispatched inner sampler factory: overload selected by the type of the +// default-constructed `tag` arg. Avoids the per-inner adapter struct. +inline SphericalRectangle buildInner(float32_t3x3 basis, float32_t2 r0, float32_t2 ext, SphericalRectangle /*tag*/) +{ + return SphericalRectangle::create(basis, float32_t3(r0, 1.0f), ext); +} + +inline ProjectedSphericalRectangle buildInner(float32_t3x3 basis, float32_t2 r0, float32_t2 ext, ProjectedSphericalRectangle /*tag*/) +{ + shapes::CompressedSphericalRectangle compressed; + compressed.origin = basis[0] * r0.x + basis[1] * r0.y + basis[2]; + compressed.right = basis[0] * ext.x; + compressed.up = basis[1] * ext.y; + return ProjectedSphericalRectangle::create(compressed, float32_t3(0.0f, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, 1.0f), false); +} + +// Spherical Pyramid: gnomonic bounding rectangle for silhouette sampling. +// +// UseCaliper=false: axis1 picks the longest world-space silhouette edge +// (one compare per edge, no inner loop, blind to perpendicular spread). +// UseCaliper=true: spherical rotating-caliper. For each candidate edge (A, B), +// the extremal opposing vertex C is found via argmax_K dot(C_K, precross) +// where precross = cross(B-A, n0); this matches argmax dot(n0, cross(C+A, C+B)) +// by the cyclic scalar triple product. Score = cos(dihedral) between the +// AB-great-circle and the Lexell-circle plane through (-A, -B, C). The +// lune cosine is a heuristic; the post-search bound pass is exact regardless. +// +// Pipeline: axis3 = normalize(-unnormCentroid); axis1 = project bestEdge3d +// onto plane(axis3); axis2 = cross(axis3, axis1); computeBound3D yields +// (rectR0, rectExtents). axis3 is not stored, reconstructed via getAxis3(). +// +// rectR0/rectExtents are returned out-params from createFromVertices and not +// stored on the pyramid (the inner sampler keeps its own copy). The local +// vertex array dies at end-of-create-scope; only the inner sampler persists. +template +struct SphericalPyramid +{ + using scalar_type = float32_t; + using vector2_type = float32_t2; + using vector3_type = float32_t3; + using domain_type = vector2_type; + using codomain_type = vector3_type; + using density_type = scalar_type; + using weight_type = density_type; + + // Caches the inner sampler's cache plus a pre-computed `pdf` that bakes in + // the silhouette/horizon validity test from generate(). + struct cache_type + { + typename InnerSampler::cache_type inner; + density_type pdf; + }; + + float32_t3 axis1; + float32_t3 axis2; // axis3 reconstructed via getAxis3() = cross(axis1, axis2) + + // Per-edge cross products in world space. Populated during Pass 1's + // centroid accumulation (also cached for caliper scoring), used by + // isInside(dir) in generate(). + shapes::SilEdgeNormals silEdgeNormals; + + // Constructed by create(silhouette, view) via tag-dispatched buildInner. + // The synth-vertices path (createFromVertices direct) leaves it default-init. + InnerSampler inner; + + float32_t3 getAxis3() NBL_CONST_MEMBER_FUNC { return cross(axis1, axis2); } + + // Pass 1: per-edge cross + Stokes centroid; UseCaliper=false also tracks + // the longest world edge here. Out params exist in both modes so the + // per-count cascade has one signature; DCE drops the longest-edge body when + // UseCaliper=true. + template + void processEdge(float32_t3 vertices[shapes::MaxOBBSilhouetteVertices], NBL_REF_ARG(float32_t3) unnormCentroid, NBL_REF_ARG(float32_t) bestLenSq, NBL_REF_ARG(float32_t3) bestEdge3d, NBL_REF_ARG(uint32_t) bestEdge) + { + const float32_t3 vI = vertices[I]; + const float32_t3 vJ = vertices[J]; + + const float32_t3 c = cross(vI, vJ); + silEdgeNormals.edgeNormals[I] = c; + unnormCentroid += c; + + if (!UseCaliper) + { + // Explicit nbl::hlsl::select so DXC emits scalar-conditional OpSelect + // for the vec3 update instead of a bool-broadcast v3bool. + const float32_t3 edge3d = vJ - vI; + const float32_t lenSq = dot(edge3d, edge3d); + const bool isBest = lenSq > bestLenSq; + bestLenSq = max(lenSq, bestLenSq); + bestEdge3d = nbl::hlsl::select(isBest, edge3d, bestEdge3d); + bestEdge = nbl::hlsl::select(isBest, I, bestEdge); + } + } + + // Caliper-only helpers (DCE'd when UseCaliper=false). + + // Track the silhouette vertex with max dot(vK, precross). SkipA/SkipB are + // the candidate edge's (I, J); compile-time skipped (drops the verts[K] + // read entirely). Assumes vertices are ~unit length so we can skip the + // per-K |vK| factor in the cosine. + template + static void tryK(float32_t3 vertices[shapes::MaxOBBSilhouetteVertices], float32_t3 precross, NBL_REF_ARG(float32_t) bestNum, NBL_REF_ARG(float32_t3) bestC) + { + if (K != SkipA && K != SkipB) + { + const float32_t3 vK = vertices[K]; + const float32_t num = dot(vK, precross); + const bool better = num > bestNum; + bestNum = max(num, bestNum); + bestC = nbl::hlsl::select(better, vK, bestC); + } + } + + // Cascade-on-count K scan with (I, J) as compile-time skips. bestNum seeds + // at -inf; bestC's placeholder is always overwritten (count >= 3). + template + static float32_t3 findExtremalC(float32_t3 vertices[shapes::MaxOBBSilhouetteVertices], uint32_t count, float32_t3 precross) + { + float32_t bestNum = -1e30f; + float32_t3 bestC = vertices[0]; + tryK<0, I, J>(vertices, precross, bestNum, bestC); + tryK<1, I, J>(vertices, precross, bestNum, bestC); + tryK<2, I, J>(vertices, precross, bestNum, bestC); + if (count > 3) + { + tryK<3, I, J>(vertices, precross, bestNum, bestC); + if (count > 4) + { + tryK<4, I, J>(vertices, precross, bestNum, bestC); + if (count > 5) + { + tryK<5, I, J>(vertices, precross, bestNum, bestC); + if (count > 6) + tryK<6, I, J>(vertices, precross, bestNum, bestC); + } + } + } + return bestC; + } + + // Score candidate edge (I, J) by cos(dihedral) between AB-great-circle + // and Lexell plane through (-A, -B, C_win). Identity used: + // cross(C+A, C+B) = n0 + cross(A, C) + cross(C, B) + // so we reuse cached n0. Larger score = smaller bounding lune. max(.,1e-30f) + // keeps rsqrt finite on collapsed edges (they lose on numerator anyway). + template + static void evalCandidate(float32_t3 vertices[shapes::MaxOBBSilhouetteVertices], uint32_t count, NBL_CONST_REF_ARG(shapes::SilEdgeNormals) sen, NBL_REF_ARG(float32_t) bestScore, NBL_REF_ARG(float32_t3) bestEdge3d, NBL_REF_ARG(uint32_t) bestEdge) + { + const float32_t3 vI = vertices[I]; + const float32_t3 vJ = vertices[J]; + const float32_t3 n0 = sen.edgeNormals[I]; + const float32_t3 edge3d = vJ - vI; + + const float32_t3 precross = cross(edge3d, n0); + const float32_t3 C = findExtremalC(vertices, count, precross); + + const float32_t3 lexell_n1 = n0 + cross(vI, C) + cross(C, vJ); + const float32_t numerator = dot(n0, lexell_n1); + const float32_t edgeDenomSq = dot(n0, n0) * dot(lexell_n1, lexell_n1); + const float32_t score = numerator * rsqrt(max(edgeDenomSq, 1e-30f)); + + const bool better = score > bestScore; + bestScore = max(score, bestScore); + bestEdge3d = nbl::hlsl::select(better, edge3d, bestEdge3d); + bestEdge = nbl::hlsl::select(better, I, bestEdge); + } + + // Gnomonic-project each silhouette vertex into the (axis1, axis2, axis3) + // frame and accumulate the AABB. + template + static void boundOne3D(float32_t3 vertices[shapes::MaxOBBSilhouetteVertices], float32_t3 axis1, float32_t3 perp, float32_t3 axis3, NBL_REF_ARG(float32_t4) bound) + { + const float32_t3 vert = vertices[I]; + const float32_t rcpDp = rcp(dot(vert, axis3)); + const float32_t x = dot(vert, axis1) * rcpDp; + const float32_t y = dot(vert, perp) * rcpDp; + bound.x = min(bound.x, x); + bound.y = min(bound.y, y); + bound.z = max(bound.z, x); + bound.w = max(bound.w, y); + } + + static void computeBound3D(float32_t3 vertices[shapes::MaxOBBSilhouetteVertices], uint32_t count, float32_t3 axis1, float32_t3 perp, float32_t3 axis3, NBL_REF_ARG(float32_t4) bound) + { + bound = float32_t4(1e10f, 1e10f, -1e10f, -1e10f); + boundOne3D<0>(vertices, axis1, perp, axis3, bound); + boundOne3D<1>(vertices, axis1, perp, axis3, bound); + boundOne3D<2>(vertices, axis1, perp, axis3, bound); + if (count > 3) + { + boundOne3D<3>(vertices, axis1, perp, axis3, bound); + if (count > 4) + { + boundOne3D<4>(vertices, axis1, perp, axis3, bound); + if (count > 5) + { + boundOne3D<5>(vertices, axis1, perp, axis3, bound); + if (count > 6) + boundOne3D<6>(vertices, axis1, perp, axis3, bound); + } + } + } + } + + // Pyramid from pre-materialized verts; (rectR0, rectExtents) returned as + // out-params (not stored on the pyramid). + static SphericalPyramid createFromVertices(float32_t3 vertices[shapes::MaxOBBSilhouetteVertices], uint32_t count, NBL_REF_ARG(float32_t2) outRectR0, NBL_REF_ARG(float32_t2) outRectExtents) + { + SphericalPyramid self; + // Sentinel-init so unused slots (count..6) produce dot(dir,(0,0,-1)) < 0 + // for the sign-bit AND in shapes::SilEdgeNormals::isInside. + self.silEdgeNormals = shapes::SilEdgeNormals::initSentinel(); + + // Tiny z-bias seed so symmetric shapes don't normalize(0) to NaN; the + // cross sum dominates for any non-degenerate silhouette. + // verts past count are zero-init by materialize, so reading them is harmless. + float32_t3 unnormCentroid = float32_t3(0.0f, 0.0f, 1e-6f); + float32_t bestLenSq = 0.0f; + float32_t3 bestEdge3d = float32_t3(1.0f, 0.0f, 0.0f); + uint32_t bestEdge = 0; + + self.processEdge<0, 1>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + self.processEdge<1, 2>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + if (count == 3) + { + self.processEdge<2, 0>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + } + else + { + self.processEdge<2, 3>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + if (count == 4) + { + self.processEdge<3, 0>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + } + else + { + self.processEdge<3, 4>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + if (count == 5) + { + self.processEdge<4, 0>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + } + else + { + self.processEdge<4, 5>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + if (count == 6) + { + self.processEdge<5, 0>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + } + else // count == 7 + { + self.processEdge<5, 6>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + self.processEdge<6, 0>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + } + } + } + } + + const float32_t3 axis3 = normalize(-unnormCentroid); + + // Pass 2: caliper dihedral scan overwrites bestEdge3d. Skipped under + // UseCaliper=false (keeps Pass 1's longest edge). + if (UseCaliper) + { + float32_t bestScore = -2.0f; + + evalCandidate<0, 1>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + evalCandidate<1, 2>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + if (count == 3) + { + evalCandidate<2, 0>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + } + else + { + evalCandidate<2, 3>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + if (count == 4) + { + evalCandidate<3, 0>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + } + else + { + evalCandidate<3, 4>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + if (count == 5) + { + evalCandidate<4, 0>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + } + else + { + evalCandidate<4, 5>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + if (count == 6) + { + evalCandidate<5, 0>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + } + else // count == 7 + { + evalCandidate<5, 6>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + evalCandidate<6, 0>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + } + } + } + } + } + + // axis1 = winning chord projected onto plane(axis3) and normalized. + // max(lenSq, 1e-12) keeps rsqrt finite; degenerate select picks a stable + // axis perpendicular to axis3. + const float32_t3 inPlaneEdge = bestEdge3d - axis3 * dot(bestEdge3d, axis3); + const float32_t inPlaneLenSq = dot(inPlaneEdge, inPlaneEdge); + const bool useY = abs(axis3.x) >= 0.9f; + const float32_t scale = rsqrt(max(inPlaneLenSq, 1e-12f)); + + const bool degenerate = inPlaneLenSq <= 1e-12f; + const float32_t3 fallbackAxis1 = nbl::hlsl::select(useY, float32_t3(0.0f, 1.0f, 0.0f), float32_t3(1.0f, 0.0f, 0.0f)); + self.axis1 = nbl::hlsl::select(degenerate, fallbackAxis1, inPlaneEdge * scale); + self.axis2 = cross(axis3, self.axis1); + + float32_t4 bestBound; + computeBound3D(vertices, count, self.axis1, self.axis2, axis3, bestBound); + + // Per-axis degenerate clamp: each upper bound at least 1e-6 above lower. + // Independent per axis so a single collapsed axis doesn't kill the other. + bestBound.zw = max(bestBound.zw, bestBound.xy + 1e-6f); + + outRectR0 = bestBound.xy; + outRectExtents = float32_t2(bestBound.zw - bestBound.xy); + + // Pre-rotate edge normals into local frame so per-sample inside test + // can use the cheaper 2D form (2 muls + 2 adds + n.z per edge instead + // of 3 muls + 2 adds). Amortized once per build; saves 7 muls/sample. + self.silEdgeNormals.transformToLocal(self.axis1, self.axis2, axis3); + + return self; + } + + // Materialize verts (in shading-point-relative coords baked into silhouette) + // from the silhouette, build the pyramid, then construct the InnerSampler + // via tag-dispatched buildInner. Local rect data dies at end-of-scope; only + // the inner sampler retains a copy. + static SphericalPyramid create(NBL_CONST_REF_ARG(shapes::ClippedSilhouette) silhouette, shapes::OBBView view) + { + float32_t3 vertices[shapes::MaxOBBSilhouetteVertices]; + silhouette.materialize(view, vertices); + + float32_t2 rectR0, rectExtents; + SphericalPyramid self = createFromVertices(vertices, silhouette.count, rectR0, rectExtents); + + // tag's value is unread; only its type selects the overload. + const float32_t3x3 basis = float32_t3x3(self.axis1, self.axis2, self.getAxis3()); + InnerSampler tag; + self.inner = buildInner(basis, rectR0, rectExtents, tag); + return self; + } + + // Generate via inner.generateNormalizedLocal so we can recover gnomonic + // (localX, localY) for the 2D inside test. With rectR0.z == 1, localDir.z = + // 1/hitDist, so localDir.{x,y} * hitDist == gnomonic coords. Bake + // silhouette/horizon validity into cache.pdf so forwardPdf is O(1). + codomain_type generate(domain_type u, NBL_REF_ARG(cache_type) cache) + { + scalar_type hitDist; + const codomain_type localDir = inner.generateNormalizedLocal(u, cache.inner, hitDist); + const codomain_type dir = localDir.x * axis1 + localDir.y * axis2 + localDir.z * getAxis3(); + const scalar_type localX = localDir.x * hitDist; + const scalar_type localY = localDir.y * hitDist; + const bool valid = dir.z > 0.0f && silEdgeNormals.isInsideLocal(localX, localY); + cache.pdf = hlsl::select(valid, inner.forwardPdf(u, cache.inner), 0.0f); + return dir; + } + + density_type forwardPdf(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; } + weight_type forwardWeight(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; } + uint32_t selectedIdx(cache_type cache) NBL_CONST_MEMBER_FUNC { return 0u; } +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl index 131cc92d70..e3cbb70998 100644 --- a/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl +++ b/include/nbl/builtin/hlsl/sampling/spherical_rectangle.hlsl @@ -19,156 +19,214 @@ namespace hlsl namespace sampling { -template +// UseRealSinCos=true (default): math::sincos uses real sin+cos; full precision near au=n*pi, Jacobian test passes. +// UseRealSinCos=false : math::sincos uses cos + sqrt(1-c*c); saves one special-function op but loses mantissa as |cos_au| -> 1. +template struct SphericalRectangle { - using scalar_type = T; - using vector2_type = vector; - using vector3_type = vector; - using vector4_type = vector; - - // BackwardTractableSampler concept types - using domain_type = vector2_type; - using codomain_type = vector3_type; - using density_type = scalar_type; - using weight_type = density_type; - - struct cache_type {}; - - NBL_CONSTEXPR_STATIC_INLINE scalar_type ClampEps = 1e-5; - - static SphericalRectangle create(NBL_CONST_REF_ARG(shapes::SphericalRectangle) rect, const vector3_type observer) - { - return create(rect.solidAngle(observer), rect.extents); - } - - static SphericalRectangle create(NBL_CONST_REF_ARG(typename shapes::SphericalRectangle::solid_angle_type) sa, const vector2_type _extents) - { - SphericalRectangle retval; - retval.r0 = sa.r0; - retval.extents = _extents; - - retval.solidAngle = sa.value; - retval.b0 = sa.n_z[0]; - retval.b1 = sa.n_z[2]; - - math::sincos_accumulator angle_adder = math::sincos_accumulator::create(sa.cosGamma[2]); - angle_adder.addCosine(sa.cosGamma[3]); - retval.k = scalar_type(2.0) * numbers::pi - angle_adder.getSumOfArccos(); - - return retval; - } - - // Create directly from a local-frame corner position and rectangle extents. - // Use when you already know r0 (e.g. from a gnomonic projection) and don't - // need the shapes::SphericalRectangle + solidAngle(observer) roundtrip. - static SphericalRectangle create(const vector3_type _r0, const vector2_type _extents) - { - // Same math as shapes::SphericalRectangle::solidAngle() but without - // the mul(basis, origin - observer) step since we already have r0. - typename shapes::SphericalRectangle::solid_angle_type sa; - sa.r0 = _r0; - - const scalar_type zSq = _r0.z * _r0.z; - const vector4_type denorm_n_z = vector4_type(-_r0.y, _r0.x + _extents.x, _r0.y + _extents.y, -_r0.x); - sa.n_z = denorm_n_z * hlsl::rsqrt(hlsl::promote(zSq) + denorm_n_z * denorm_n_z); - sa.cosGamma = vector4_type( - -sa.n_z[0] * sa.n_z[1], -sa.n_z[1] * sa.n_z[2], - -sa.n_z[2] * sa.n_z[3], -sa.n_z[3] * sa.n_z[0]); - - math::sincos_accumulator acc = math::sincos_accumulator::create(sa.cosGamma[0]); - acc.addCosine(sa.cosGamma[1]); - acc.addCosine(sa.cosGamma[2]); - acc.addCosine(sa.cosGamma[3]); - sa.value = acc.getSumOfArccos() - scalar_type(2.0) * numbers::pi; - - return create(sa, _extents); - } - - // shared core of generate and generateSurfaceOffset - // returns (xu, hv, d) packed into a vector3; caller derives either 2D offset or 3D direction - vector3_type __generate(const domain_type u) NBL_CONST_MEMBER_FUNC - { - // algorithm needs r0.z < 0; use -abs(r0.z) without storing the flip - const scalar_type negAbsR0z = -hlsl::abs(r0.z); - const scalar_type r0zSq = r0.z * r0.z; - const vector2_type r1 = vector2_type(r0.x + extents.x, r0.y + extents.y); - - const scalar_type au = u.x * solidAngle + k; - const scalar_type cos_au = hlsl::cos(au); - const scalar_type numerator = b1 - cos_au * b0; - // (1-cos)*(1+cos) avoids catastrophic cancellation of 1-cos^2 when cos_au is near +/-1 - const scalar_type sin_au_sq = (scalar_type(1.0) - cos_au) * (scalar_type(1.0) + cos_au); - const scalar_type absNegFu = hlsl::abs(numerator) * hlsl::rsqrt(sin_au_sq); - const scalar_type rcpCu_2 = hlsl::max(absNegFu * absNegFu + b0 * b0, scalar_type(1.0)); - // sign(negFu) = sign(numerator) * sign(sin(au)); sin(au) < 0 iff au > PI - const scalar_type negFuSign = hlsl::select((au > numbers::pi) != (numerator < scalar_type(0.0)), scalar_type(-1.0), scalar_type(1.0)); - scalar_type xu = negAbsR0z * negFuSign * hlsl::rsqrt(rcpCu_2 - scalar_type(1.0)); - xu = hlsl::clamp(xu, r0.x, r1.x); // avoid Infs - const scalar_type d_2 = xu * xu + r0zSq; - const scalar_type d = hlsl::sqrt(d_2); - - const scalar_type h0 = r0.y * hlsl::rsqrt(d_2 + r0.y * r0.y); - const scalar_type h1 = r1.y * hlsl::rsqrt(d_2 + r1.y * r1.y); - const scalar_type hv = h0 + u.y * (h1 - h0); - - return vector3_type(xu, hv, d); - } - - // returns a normalized 3D direction in the local frame with correct r0.z sign - codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC - { - const vector3_type core = __generate(u); - const scalar_type xu = core.x; - const scalar_type hv = core.y; - const scalar_type d = core.z; - const scalar_type hv2 = hv * hv; - const scalar_type cosElevation = hlsl::sqrt(hlsl::max(scalar_type(1.0) - hv2, scalar_type(0.0))); - const scalar_type rcpD = scalar_type(1.0) / d; - - return vector3_type(xu * cosElevation * rcpD, hv, r0.z * cosElevation * rcpD); - } - - // returns a 2D offset on the rectangle surface from the r0 corner - vector2_type generateSurfaceOffset(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC - { - const vector3_type core = __generate(u); - const scalar_type xu = core.x; - const scalar_type hv = core.y; - const scalar_type d = core.z; - const scalar_type r1y = r0.y + extents.y; - const scalar_type hv2 = hv * hv; - const scalar_type yv = hlsl::mix(r1y, (hv * d) / hlsl::sqrt(scalar_type(1.0) - hv2), hv2 < scalar_type(1.0) - ClampEps); - - return vector2_type((xu - r0.x), (yv - r0.y)); - } - - density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC - { - return scalar_type(1.0) / solidAngle; - } - - weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC - { - return forwardPdf(u, cache); - } - - density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC - { - return scalar_type(1.0) / solidAngle; - } - - weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC - { - return backwardPdf(L); - } - - scalar_type solidAngle; - scalar_type k; - scalar_type b0; - scalar_type b1; - vector3_type r0; - vector2_type extents; + using scalar_type = T; + using vector2_type = vector; + using vector3_type = vector; + using vector4_type = vector; + using matrix3x3_type = matrix; + + // BackwardTractableSampler concept types + using domain_type = vector2_type; + using codomain_type = vector3_type; + using density_type = scalar_type; + using weight_type = density_type; + + struct cache_type + { + }; + + static SphericalRectangle create(NBL_CONST_REF_ARG(shapes::SphericalRectangle) rect, const vector3_type observer) + { + return create(rect.basis, rect.solidAngle(observer), rect.extents); + } + + static SphericalRectangle create(const matrix3x3_type _basis, NBL_CONST_REF_ARG(typename shapes::SphericalRectangle::solid_angle_type) sa, const vector2_type _extents) + { + SphericalRectangle retval; + retval.basis = _basis; + retval.r0 = sa.r0; + retval.extents = _extents; + + retval.solidAngle = sa.value; + retval.b0 = sa.n_z[0]; + retval.b1 = sa.n_z[2]; + + math::sincos_accumulator angle_adder = math::sincos_accumulator::create(sa.cosGamma[2]); + angle_adder.addCosine(sa.cosGamma[3]); + retval.k = _static_cast(2.0) * numbers::pi - angle_adder.getSumOfArccos(); + + return retval; + } + + // Create directly from a local-frame corner position and rectangle extents. + // Use when you already know r0 (e.g. from a gnomonic projection) and don't + // need the shapes::SphericalRectangle + solidAngle(observer) roundtrip. + static SphericalRectangle create(const matrix3x3_type _basis, const vector3_type _r0, const vector2_type _extents) + { + // Same math as shapes::SphericalRectangle::solidAngle() but without + // the mul(basis, origin - observer) step since we already have r0. + typename shapes::SphericalRectangle::solid_angle_type sa; + sa.r0 = _r0; + + const scalar_type zSq = _r0.z * _r0.z; + const vector4_type denorm_n_z = vector4_type(-_r0.y, _r0.x + _extents.x, _r0.y + _extents.y, -_r0.x); + sa.n_z = denorm_n_z * hlsl::rsqrt(hlsl::promote(zSq) + denorm_n_z * denorm_n_z); + sa.cosGamma = vector4_type( + -sa.n_z[0] * sa.n_z[1], -sa.n_z[1] * sa.n_z[2], + -sa.n_z[2] * sa.n_z[3], -sa.n_z[3] * sa.n_z[0]); + + math::sincos_accumulator acc = math::sincos_accumulator::create(sa.cosGamma[0]); + acc.addCosine(sa.cosGamma[1]); + acc.addCosine(sa.cosGamma[2]); + acc.addCosine(sa.cosGamma[3]); + sa.value = acc.getSumOfArccos() - _static_cast(2.0) * numbers::pi; + + return create(_basis, sa, _extents); + } + + // shared core of generate and generateSurfaceOffset + // returns (xu, hv, d) packed into a vector3; caller derives either 2D offset or 3D direction + struct SCommonGen + { + scalar_type xu; + scalar_type d2; + scalar_type hv; + scalar_type cosElevation2; + }; + SCommonGen __generate(const domain_type u) NBL_CONST_MEMBER_FUNC + { + SCommonGen retval; + + // algorithm needs r0.z < 0; use -abs(r0.z) without storing the flip + const scalar_type negAbsR0z = -hlsl::abs(r0.z); + const scalar_type r0zSq = r0.z * r0.z; + const vector2_type r1 = vector2_type(r0.x + extents.x, r0.y + extents.y); + + // au in [0, 4*pi] since u.x in [0,1], solidAngle <= 2*pi, k <= 2*pi. + // The sqrt path in math::sincos recovers sin sign via sign(theta), which only holds for theta in [-pi,pi]. + // Real sin/cos handle any range, so only wrap on the sqrt path (compile-time branch folds away). + scalar_type au = u.x * solidAngle + k; + NBL_IF_CONSTEXPR(!UseRealSinCos) + { + // au in [0, 4*pi] -> peel at most two 2*pi periods to land in (-pi, pi]. + au = hlsl::select(au > numbers::pi, au - _static_cast(2.0) * numbers::pi, au); + au = hlsl::select(au > numbers::pi, au - _static_cast(2.0) * numbers::pi, au); + } + scalar_type sin_au, cos_au; + math::sincos(au, sin_au, cos_au); + const scalar_type numerator = b1 - cos_au * b0; + // negFu carries the sign directly (numerator and sin_au both signed), so xu's sign drops + // out of a single multiply + hlsl::sign. + const scalar_type negFu = numerator / sin_au; + const scalar_type rcpCu_2 = hlsl::max(negFu * negFu + b0 * b0, _static_cast(1.0)); + retval.xu = negAbsR0z * hlsl::sign(negFu) * hlsl::rsqrt(rcpCu_2 - _static_cast(1.0)); + retval.xu = hlsl::clamp(retval.xu, r0.x, r1.x); // avoid Infs + retval.d2 = retval.xu * retval.xu + r0zSq; + + const scalar_type h0 = r0.y * hlsl::rsqrt(retval.d2 + r0.y * r0.y); + const scalar_type h1 = r1.y * hlsl::rsqrt(retval.d2 + r1.y * r1.y); + retval.hv = h0 + u.y * (h1 - h0); + retval.cosElevation2 = _static_cast(1.0) - hlsl::min(retval.hv * retval.hv, 1); + + return retval; + } + + // returns a normalized 3D direction in the local frame with correct r0.z sign + vector3_type generateNormalizedLocal(const domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) NBL_CONST_MEMBER_FUNC + { + const SCommonGen core = __generate(u); + scalar_type cosElevationOverD = hlsl::rsqrt(core.d2 / core.cosElevation2); + // TODO: or shall we do some other more sophisticated clamp or correction? Is this even the right one to use? + cosElevationOverD = hlsl::select(hlsl::isnan(cosElevationOverD), 1.f, cosElevationOverD); + + // TODO: investigate if due to precision we need to compute this as a `sqrt` then the quantity being computed is `core.cosElevation2/core.d2` + // which what alss `generateLocalBasisXY` needs, in which case `__generate` can already compute it and return it as `hitDist2` + hitDist = 1.f / cosElevationOverD; + + // x/hitDist == x * cosElevationOverD; saves two divs vs the obvious form. + const vector3_type retval = vector3_type(core.xu * cosElevationOverD, core.hv, r0.z * cosElevationOverD); + assert(!hlsl::isnan(computeHitT(retval))); + return retval; + } + + codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + scalar_type dummy; + const vector3_type localL = generateNormalizedLocal(u, cache, dummy); + // could return `hlsl::mul(hlsl::tranpose(basis),localL)` or just this + return basis[0] * localL[0] + basis[1] * localL[1] + basis[2] * localL[2]; + } + + // utility to determine maxT for a ray from L shot from the origin which we're sure intersects the rectangle + scalar_type computeHitT(const vector3_type L) NBL_CONST_MEMBER_FUNC + { + const scalar_type retval = hlsl::abs(r0.z / hlsl::dot(L, basis[2])); + { + const vector3_type hitPointRelative = L * retval; + const vector2_type uv = mul(basis, hitPointRelative).xy - r0.xy; + const vector2_type tol = hlsl::max(hlsl::abs(extents), hlsl::promote(_static_cast(1.0))) * _static_cast(1e-5); + assert(uv[0] >= -tol[0] && uv[0] <= extents[0] + tol[0]); + assert(uv[1] >= -tol[1] && uv[1] <= extents[1] + tol[1]); + } + return retval; + } + + // returns a 2D offset on the rectangle surface including the r0 corner -useful for generating unnormalized worldspace L + vector2_type generateLocalBasisXY(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + const SCommonGen core = __generate(u); + const scalar_type r1y = r0.y + extents.y; + // TODO: see if we can compute this direct from definition of `d_2` ignoring the clamp on `xu` and `cosElevation2` + const scalar_type yv = core.hv * hlsl::rsqrt(core.cosElevation2 / core.d2); + + // fun fact, when one of the operands to min or max is NaN, the SPIR-V builtin will select the other one + // TODO: maybe try just `min(yv,ry1)` + const vector2_type retval = vector2_type(core.xu, hlsl::clamp(yv, r0.y, r1y)); + assert(retval[0] >= r0.x && retval[1] >= r0.y); + assert(retval[0] <= r0.x + extents[0] && retval[1] <= r0.y + extents[1]); + return retval; + } + + // its basically the hitpoint minus the observer origin + codomain_type generateUnnormalized(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC + { + const vector2_type localXY = generateLocalBasisXY(u, cache); + // the `localXY` already contains r0.xy + return basis[0] * localXY[0] + basis[1] * localXY[1] + basis[2] * r0.z; + } + + density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC + { + return _static_cast(1.0) / solidAngle; + } + + weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC + { + return forwardPdf(u, cache); + } + + density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC + { + return _static_cast(1.0) / solidAngle; + } + + weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC + { + return backwardPdf(L); + } + + matrix3x3_type basis; + vector3_type r0; + vector2_type extents; + scalar_type solidAngle; + scalar_type k; + scalar_type b0; + scalar_type b1; }; } // namespace sampling diff --git a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl index 6f29582e04..b915a55de7 100644 --- a/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl +++ b/include/nbl/builtin/hlsl/sampling/spherical_triangle.hlsl @@ -11,6 +11,7 @@ #include #include #include +#include namespace nbl { @@ -25,9 +26,6 @@ enum SphericalTriangleAlgorithm : uint16_t STA_PBRT = 1 }; -template -struct SphericalTriangle; - namespace impl { @@ -53,9 +51,8 @@ T sumOfProducts(T a, T b, T c, T d) } // namespace impl -// Non-bijective: Resamplable & Tractable (generate + pdf/weight, no inverse) -template -struct SphericalTriangle +template +struct SphericalTriangle { using scalar_type = T; using vector2_type = vector; @@ -73,22 +70,23 @@ struct SphericalTriangle static SphericalTriangle create(NBL_CONST_REF_ARG(shapes::SphericalTriangle) tri) { SphericalTriangle retval; - retval.rcpSolidAngle = scalar_type(1.0) / tri.solid_angle; + retval.rcpSolidAngle = _static_cast(1.0) / tri.solid_angle; retval.tri_vertices[0] = tri.vertices[0]; retval.tri_vertices[1] = tri.vertices[1]; - retval.triCosC = tri.cos_sides[2]; + retval.triCosc = tri.cos_sides[2]; // precompute great circle normal of arc AC: cross(A,C) has magnitude sin(b), // so multiplying by csc(b) normalizes it; zero when side AC is degenerate - const scalar_type cscB = tri.csc_sides[1]; - const vector3_type arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscB < numeric_limits::max, cscB, scalar_type(0)); + const scalar_type cscb = tri.csc_sides[1]; + const vector3_type arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscb < numeric_limits::max, cscb, _static_cast(0)); retval.e_C = hlsl::cross(arcACPlaneNormal, tri.vertices[0]); retval.cosA = tri.cos_vertices[0]; retval.sinA = tri.sin_vertices[0]; if (Algorithm == STA_ARVO) { - retval.sinA_triCosC = retval.sinA * retval.triCosC; + retval.sinA_triCosc = retval.sinA * retval.triCosc; retval.eCdotB = hlsl::dot(retval.e_C, tri.vertices[1]); } + retval.APlusC = tri.vertices[0] + tri.vertices[2]; return retval; } @@ -106,36 +104,37 @@ struct SphericalTriangle if (Algorithm == STA_ARVO) // faster than PBRT { const scalar_type u_ = t - cosA; - const scalar_type v_ = s + sinA_triCosC; + const scalar_type v_ = s + sinA_triCosc; const scalar_type num = (v_ * t - u_ * s) * cosA - v_; const scalar_type denum = (v_ * s + u_ * t) * sinA; +#define ACCURATE 1 #if ACCURATE // sqrt(1 - cosBp^2) loses precision when cosBp ~ 1 (small u.x). // Use stable factorization: sinBp = sqrt((denum-num)(denum+num)) / |denum| - // where denum-num = sinA*(1+triCosC)*(1-cosA_hat). + // where denum-num = sinA*(1+triCosc)*(1-cosA_hat). // For large triangles with high u.x, cosA_hat can approach -1, // making (1 + cosA_hat) near zero and the division unstable. // Use the algebraic identity only when cosA_hat > 0 (safe denominator). - const scalar_type rcpDenum = scalar_type(1) / denum; - const scalar_type oneMinusCosAhat = hlsl::select(cosA_hat > scalar_type(0), sinA_hat * sinA_hat / (scalar_type(1) + cosA_hat), scalar_type(1) - cosA_hat); - const scalar_type DminusN = sinA * (scalar_type(1) + triCosC) * oneMinusCosAhat; - sinBp = sqrt(max(scalar_type(0), DminusN * (denum + num))) * nbl::hlsl::abs(rcpDenum); - cosBp = scalar_type(1) - DminusN * rcpDenum; + const scalar_type rcpDenum = _static_cast(1) / denum; + const scalar_type oneMinusCosAhat = hlsl::select(cosA_hat > _static_cast(0), sinA_hat * sinA_hat / (_static_cast(1) + cosA_hat), _static_cast(1) - cosA_hat); + const scalar_type DminusN = sinA * (_static_cast(1) + triCosc) * oneMinusCosAhat; + sinBp = sqrt(max(_static_cast(0), DminusN * (denum + num))) * nbl::hlsl::abs(rcpDenum); + cosBp = _static_cast(1) - DminusN * rcpDenum; #else // 17% faster, less accurate cosBp = num / denum; - sinBp = sqrt(max(scalar_type(0), scalar_type(1) - cosBp * cosBp)); + sinBp = sqrt(max(_static_cast(0), _static_cast(1) - cosBp * cosBp)); #endif } else // STA_PBRT, accurate, slowest { // PBRT uses cosPhi = -t, sinPhi = -s (pi offset from Arvo's A_hat) const scalar_type k1 = -t + cosA; - const scalar_type k2 = -s - sinA * triCosC; + const scalar_type k2 = -s - sinA * triCosc; cosBp = (k2 + impl::differenceOfProducts(k2, -t, k1, -s) * cosA) / (impl::sumOfProducts(k2, -s, k1, -t) * sinA); - cosBp = nbl::hlsl::clamp(cosBp, scalar_type(-1), scalar_type(1)); - sinBp = sqrt(max(scalar_type(0), scalar_type(1) - cosBp * cosBp)); + cosBp = nbl::hlsl::clamp(cosBp, _static_cast(-1), _static_cast(1)); + sinBp = sqrt(_static_cast(1) - cosBp * cosBp); } // Step 3: construct C' on the great circle through A toward C @@ -143,81 +142,24 @@ struct SphericalTriangle // Step 4: uniformly sample the great circle arc from B to C' scalar_type cosCpB; - if (Algorithm == STA_ARVO) - cosCpB = cosBp * triCosC + sinBp * eCdotB; + NBL_IF_CONSTEXPR(Algorithm == STA_ARVO) + cosCpB = cosBp * triCosc + sinBp * eCdotB; else cosCpB = nbl::hlsl::dot(cp, tri_vertices[1]); - const scalar_type z = scalar_type(1) - u.y * (scalar_type(1) - cosCpB); - const scalar_type sinZ = sqrt(max(scalar_type(0), scalar_type(1) - z * z)); + // TODO: degeneracy at u.y = 0. z = 1 - u.y*(1-cosCpB) makes sinZ = sqrt(1-z^2) behave like + // sqrt(u.y) near zero, so dL/du.y diverges as u.y^(-1/2) and every higher derivative diverges + // faster. The forward Jacobian test in 37_HLSLSamplingTests reports ~2-8% error at u.y < 0.003 + // even with the O(h^2) one-sided stencil because the third-derivative term dominates. At + // u.y = 0 exactly, L collapses to vertex B for all u.x (|det J| = 0), so it's an intrinsic + // property of the Arvo parameterization, not a bug. Fix: rework the arc interpolation to use + // a u.y -> angle mapping whose derivatives stay bounded near u.y = 0 (e.g. acos(z) = angle + // from B, then sample arc-length linearly), so the Jacobian is smooth and the skip band in + // the tester can be removed. + const scalar_type z = _static_cast(1) - u.y * (_static_cast(1) - cosCpB); + const scalar_type sinZ = sqrt(max(_static_cast(0), _static_cast(1) - z * z)); return z * tri_vertices[1] + sinZ * hlsl::normalize(cp - cosCpB * tri_vertices[1]); } - density_type forwardPdf(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC - { - return rcpSolidAngle; - } - - weight_type forwardWeight(const domain_type u, const cache_type cache) NBL_CONST_MEMBER_FUNC - { - return forwardPdf(u, cache); - } - - density_type backwardPdf(const codomain_type L) NBL_CONST_MEMBER_FUNC - { - return rcpSolidAngle; - } - - weight_type backwardWeight(const codomain_type L) NBL_CONST_MEMBER_FUNC - { - return backwardPdf(L); - } - - scalar_type rcpSolidAngle; - scalar_type cosA; - scalar_type sinA; - scalar_type sinA_triCosC; // precomputed sinA * triCosC - scalar_type eCdotB; // precomputed dot(e_C, tri_vertices[1]), Arvo only - - vector3_type tri_vertices[2]; // A and B only - scalar_type triCosC; - vector3_type e_C; // precomputed cross(arcACPlaneNormal, A), unit vector perp to A in A-C plane -}; - -// Bijective: adds generateInverse, stores extra members for the inverse mapping -template -struct SphericalTriangle -{ - using scalar_type = T; - using vector2_type = vector; - using vector3_type = vector; - - using base_type = SphericalTriangle; - using domain_type = vector2_type; - using codomain_type = vector3_type; - using density_type = scalar_type; - using weight_type = density_type; - - using cache_type = typename base_type::cache_type; - - static SphericalTriangle create(NBL_CONST_REF_ARG(shapes::SphericalTriangle) tri) - { - SphericalTriangle retval; - retval.base = base_type::create(tri); - retval.rcpSolidAngle = retval.base.rcpSolidAngle; - retval.vertexC = tri.vertices[2]; - // precompute great circle normal of arc AC (needed for generateInverse) - const scalar_type cscB = tri.csc_sides[1]; - retval.arcACPlaneNormal = hlsl::cross(tri.vertices[0], tri.vertices[2]) * hlsl::select(cscB < numeric_limits::max, cscB, scalar_type(0)); - retval.triCscC = tri.csc_sides[2]; - return retval; - } - - codomain_type generate(const domain_type u, NBL_REF_ARG(cache_type) cache) NBL_CONST_MEMBER_FUNC - { - return base.generate(u, cache); - } - - // generate() works in two steps: // u.x -> pick C' on arc AC (choosing a sub-area fraction) // u.y -> pick L on arc B->C' (linear interpolation) @@ -229,37 +171,34 @@ struct SphericalTriangle domain_type generateInverse(const codomain_type L) NBL_CONST_MEMBER_FUNC { // Step 1: find C' = intersection of great circles (B,L) and (A,C) - const vector3_type BxL = nbl::hlsl::cross(base.tri_vertices[1], L); + const vector3_type BxL = nbl::hlsl::cross(tri_vertices[1], L); const scalar_type sinBL_sq = nbl::hlsl::dot(BxL, BxL); - if (sinBL_sq < numeric_limits::epsilon) + + // C' lies on arc AC, so C' = A*cos(t) + e_C*sin(t). + // C' also lies on the B-L plane, so dot(BxL, C') = 0. + // Solving: (cos(t), sin(t)) = (tripleE, -tripleA) / R + const scalar_type tripleA = nbl::hlsl::dot(BxL, tri_vertices[0]); + const scalar_type tripleE = nbl::hlsl::dot(BxL, e_C); + const scalar_type R_sq = tripleA * tripleA + tripleE * tripleE; + + if (sinBL_sq < numeric_limits::epsilon || R_sq < numeric_limits::epsilon) { - // L ~ B: u.y ~ 0, u.x is indeterminate (all u.x map to B when u.y=0). // Recover u.y from |L-B|^2 / |A-B|^2 (using C'=A; the (1-cosCpB) ratio // cancels so any C' gives the same result). - const vector3_type LminusB = L - base.tri_vertices[1]; - const vector3_type AminusB = base.tri_vertices[0] - base.tri_vertices[1]; + const vector3_type LminusB = L - tri_vertices[1]; + const vector3_type AminusB = tri_vertices[0] - tri_vertices[1]; const scalar_type v_num = nbl::hlsl::dot(LminusB, LminusB); const scalar_type v_denom = nbl::hlsl::dot(AminusB, AminusB); const scalar_type v = hlsl::select(v_denom > numeric_limits::epsilon, - nbl::hlsl::clamp(v_num / v_denom, scalar_type(0.0), scalar_type(1.0)), - scalar_type(0.0)); - return vector2_type(scalar_type(0.0), v); + nbl::hlsl::clamp(v_num / v_denom, _static_cast(0.0), _static_cast(1.0)), + _static_cast(0.0)); + return vector2_type(_static_cast(0.0), v); } - // C' lies on arc AC, so C' = A*cos(t) + e_C*sin(t). - // C' also lies on the B-L plane, so dot(BxL, C') = 0. - // Solving: (cos(t), sin(t)) = (tripleE, -tripleA) / R - const scalar_type tripleA = nbl::hlsl::dot(BxL, base.tri_vertices[0]); - const scalar_type tripleE = nbl::hlsl::dot(BxL, base.e_C); - const scalar_type R_sq = tripleA * tripleA + tripleE * tripleE; - if (R_sq < numeric_limits::epsilon) - return vector2_type(scalar_type(0.0), scalar_type(0.0)); - - const scalar_type rcpR = scalar_type(1.0) / nbl::hlsl::sqrt(R_sq); - vector3_type cp = base.tri_vertices[0] * (tripleE * rcpR) + base.e_C * (-tripleA * rcpR); - // two intersections exist; pick the one on the minor arc A->C - if (nbl::hlsl::dot(cp, base.tri_vertices[0] + vertexC) < scalar_type(0.0)) - cp = -cp; + const scalar_type rcpR = _static_cast(1.0) / nbl::hlsl::sqrt(R_sq); + vector3_type cp = tri_vertices[0] * (tripleE * rcpR) + e_C * (-tripleA * rcpR); + // two intersections exist; pick the one on the minor arc A->C (branchless sign flip) + cp = ieee754::flipSignIfRHSNegative(cp, nbl::hlsl::dot(cp, APlusC)); // Step 2: u.x = solidAngle(A,B,C') / solidAngle(A,B,C) // Van Oosterom-Strackee: tan(Omega/2) = |A.(BxC')| / (1 + A.B + B.C' + C'.A) @@ -270,25 +209,25 @@ struct SphericalTriangle // Expanding C' = cosBp*A + sinBp*e_C into the triple product: // A.(BxC') = cosBp * A.(BxA) + sinBp * A.(Bxe_C) = sinBp * A.(Bxe_C) // since A.(BxA) = 0 identically. This avoids the cancellation. - const scalar_type cosBp_inv = nbl::hlsl::dot(cp, base.tri_vertices[0]); - const scalar_type sinBp_inv = nbl::hlsl::dot(cp, base.e_C); - const scalar_type AxBdotE = nbl::hlsl::dot(base.tri_vertices[0], nbl::hlsl::cross(base.tri_vertices[1], base.e_C)); + const scalar_type cosBp_inv = nbl::hlsl::dot(cp, tri_vertices[0]); + const scalar_type sinBp_inv = nbl::hlsl::dot(cp, e_C); + const scalar_type AxBdotE = nbl::hlsl::dot(tri_vertices[0], nbl::hlsl::cross(tri_vertices[1], e_C)); const scalar_type num = sinBp_inv * AxBdotE; - const scalar_type cosCpB = nbl::hlsl::dot(base.tri_vertices[1], cp); - const scalar_type den = scalar_type(1.0) + base.triCosC + cosCpB + cosBp_inv; - const scalar_type subSolidAngle = scalar_type(2.0) * nbl::hlsl::atan2(nbl::hlsl::abs(num), den); - const scalar_type u = nbl::hlsl::clamp(subSolidAngle * rcpSolidAngle, scalar_type(0.0), scalar_type(1.0)); + const scalar_type cosCpB = nbl::hlsl::dot(tri_vertices[1], cp); + const scalar_type den = _static_cast(1.0) + triCosc + cosCpB + cosBp_inv; + const scalar_type subSolidAngle = _static_cast(2.0) * nbl::hlsl::atan2(nbl::hlsl::abs(num), den); + const scalar_type u = nbl::hlsl::clamp(subSolidAngle * rcpSolidAngle, _static_cast(0.0), _static_cast(1.0)); // Step 3: u.y = |L-B|^2 / |C'-B|^2 // Squared Euclidean distance avoids catastrophic cancellation vs (1-dot)/(1-dot) - const vector3_type LminusB = L - base.tri_vertices[1]; - const vector3_type cpMinusB = cp - base.tri_vertices[1]; + const vector3_type LminusB = L - tri_vertices[1]; + const vector3_type cpMinusB = cp - tri_vertices[1]; const scalar_type v_num = nbl::hlsl::dot(LminusB, LminusB); const scalar_type v_denom = nbl::hlsl::dot(cpMinusB, cpMinusB); const scalar_type v = hlsl::select(v_denom > numeric_limits::epsilon, nbl::hlsl::clamp(v_num / nbl::hlsl::max(v_denom, numeric_limits::min), - scalar_type(0.0), scalar_type(1.0)), - scalar_type(0.0)); + _static_cast(0.0), _static_cast(1.0)), + _static_cast(0.0)); return vector2_type(u, v); } @@ -313,13 +252,16 @@ struct SphericalTriangle return backwardPdf(L); } - // mirrored from base for uniform access across both specializations scalar_type rcpSolidAngle; + scalar_type cosA; + scalar_type sinA; + scalar_type sinA_triCosc; // precomputed sinA * triCosc + scalar_type eCdotB; // precomputed dot(e_C, tri_vertices[1]), Arvo only - base_type base; - vector3_type vertexC; - vector3_type arcACPlaneNormal; // precomputed normalize(cross(A, C)), great circle normal of arc AC - scalar_type triCscC; + vector3_type tri_vertices[2]; // A and B only + scalar_type triCosc; + vector3_type e_C; // precomputed cross(arcACPlaneNormal, A), unit vector perp to A in A-C plane + vector3_type APlusC; // precomputed A + C, used to pick the minor-arc intersection in generateInverse }; } // namespace sampling diff --git a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl index 21901a9628..0d22323921 100644 --- a/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl +++ b/include/nbl/builtin/hlsl/sampling/uniform_spheres.hlsl @@ -35,10 +35,11 @@ struct UniformHemisphere static codomain_type generate(const domain_type u) { - typename ConcentricMapping::cache_type cmCache; + typename ConcentricMapping::cache_type cmCache; const vector_t2 p = ConcentricMapping::generate(u, cmCache); - const T z = T(1.0) - cmCache.r2; - const T xyScale = hlsl::sqrt(hlsl::max(T(0.0), T(2.0) - cmCache.r2)); + assert(cmCache.r2 <= _static_cast(1.0)); + const T z = _static_cast(1.0) - cmCache.r2; + const T xyScale = hlsl::sqrt(_static_cast(2.0) - cmCache.r2); return vector_t3(p.x * xyScale, p.y * xyScale, z); } @@ -50,30 +51,30 @@ struct UniformHemisphere static domain_type generateInverse(const codomain_type v) { // r_disk / r_xy = sqrt(1-z) / sqrt(1-z^2) = 1/sqrt(1+z) - const T scale = T(1.0) / hlsl::sqrt(T(1.0) + v.z); + const scalar_type scale = hlsl::rsqrt(_static_cast(1.0) + v.z); return ConcentricMapping::generateInverse(vector_t2(v.x * scale, v.y * scale)); } static density_type forwardPdf(const domain_type u, const cache_type cache) { - return T(0.5) * numbers::inv_pi; + return _static_cast(0.5) * numbers::inv_pi; } static weight_type forwardWeight(const domain_type u, const cache_type cache) { - return T(0.5) * numbers::inv_pi; + return _static_cast(0.5) * numbers::inv_pi; } static density_type backwardPdf(const codomain_type v) { assert(v.z > 0); - return T(0.5) * numbers::inv_pi; + return _static_cast(0.5) * numbers::inv_pi; } static weight_type backwardWeight(const codomain_type v) { assert(v.z > 0); - return T(0.5) * numbers::inv_pi; + return _static_cast(0.5) * numbers::inv_pi; } }; @@ -96,7 +97,7 @@ struct UniformSphere static codomain_type generate(const domain_type u) { - const T tmp = u.x * T(2.0) - T(1.0); + const T tmp = u.x * _static_cast(2.0) - _static_cast(1.0); const codomain_type L = hemisphere_t::generate(vector_t2(hlsl::abs(tmp), u.y)); return vector_t3(L.x, L.y, L.z * hlsl::sign(tmp)); } @@ -108,29 +109,29 @@ struct UniformSphere static domain_type generateInverse(const codomain_type v) { - const T dir = hlsl::sign(v.z) * T(0.5); + const T dir = hlsl::sign(v.z) * _static_cast(0.5); const domain_type hemiU = hemisphere_t::generateInverse(vector_t3(v.x, v.y, hlsl::abs(v.z))); - return vector_t2(hemiU.x * dir + T(0.5), hemiU.y); + return vector_t2(hemiU.x * dir + _static_cast(0.5), hemiU.y); } static density_type forwardPdf(const domain_type u, const cache_type cache) { - return T(0.5) * hemisphere_t::forwardPdf(u, cache); + return _static_cast(0.5) * hemisphere_t::forwardPdf(u, cache); } static weight_type forwardWeight(const domain_type u, const cache_type cache) { - return T(0.5) * hemisphere_t::forwardWeight(u, cache); + return _static_cast(0.5) * hemisphere_t::forwardWeight(u, cache); } static density_type backwardPdf(const codomain_type v) { - return T(0.25) * numbers::inv_pi; + return _static_cast(0.25) * numbers::inv_pi; } static weight_type backwardWeight(const codomain_type v) { - return T(0.25) * numbers::inv_pi; + return _static_cast(0.25) * numbers::inv_pi; } }; diff --git a/include/nbl/builtin/hlsl/shapes/obb.hlsl b/include/nbl/builtin/hlsl/shapes/obb.hlsl index bdddc48ebf..763d434518 100644 --- a/include/nbl/builtin/hlsl/shapes/obb.hlsl +++ b/include/nbl/builtin/hlsl/shapes/obb.hlsl @@ -26,17 +26,17 @@ struct OBB axesScale[dim_i] = axes[dim_i] * obbScale[dim_i]; } OBB ret; - for (int row_i = 0; row_i < D; row_i++) + for (int16_t row_i = 0; row_i < D; row_i++) { - for (int col_i = 0; col_i < D; col_i++) + for (int16_t col_i = 0; col_i < D; col_i++) { ret.transform[row_i][col_i] = axesScale[col_i][row_i]; } } - for (int dim_i = 0; dim_i < D; dim_i++) + for (int16_t dim_i = 0; dim_i < D; dim_i++) { scalar_t sum = 0; - for (int dim_j = 0; dim_j < D; dim_j++) + for (int16_t dim_j = 0; dim_j < D; dim_j++) { sum += axesScale[dim_j][dim_i]; } @@ -49,7 +49,7 @@ struct OBB NBL_CONSTEXPR_STATIC_INLINE OBB createAxisAligned(point_t mid, point_t len) { point_t axes[D]; - for (auto dim_i = 0; dim_i < D; dim_i++) + for (int16_t dim_i = 0; dim_i < D; dim_i++) { axes[dim_i] = point_t(0); axes[dim_i][dim_i] = 1; @@ -60,6 +60,106 @@ struct OBB matrix transform; }; +// Decomposed OBB view: caches columns and minCorner for fast vertex queries. +// Same 12-float footprint as float32_t3x4, but laid out for branchless +// corner computation. +// Decomposed OBB view: caches columns and minCorner for fast vertex queries. +// columns[i] are the full OBB edge vectors; minCorner is the world position +// of corner 0b000 = center - 0.5*(col0+col1+col2). +template +struct OBBView +{ + using scalar_t = Scalar; + using vec3_t = vector; + using mat3_t = matrix; + + mat3_t columns; + vec3_t minCorner; + + static OBBView create(matrix modelMatrix) + { + matrix m = transpose(modelMatrix); + OBBView v; + v.columns = mat3_t(m[0].xyz, m[1].xyz, m[2].xyz); + v.minCorner = m[3].xyz - scalar_t(0.5) * (m[0].xyz + m[1].xyz + m[2].xyz); + return v; + } + + vec3_t getCenter() NBL_CONST_MEMBER_FUNC + { + return minCorner + scalar_t(0.5) * (columns[0] + columns[1] + columns[2]); + } + + vec3_t getVertex(uint32_t i) NBL_CONST_MEMBER_FUNC + { + vec3_t p = minCorner; + if (i & 1u) p += columns[0]; + if (i & 2u) p += columns[1]; + if (i & 4u) p += columns[2]; + return p; + } + + // Scalar-z specialization: only computes the z component of a corner. + // Used for clip-mask build where we only need the sign of z. + scalar_t getVertexZ(uint32_t i) NBL_CONST_MEMBER_FUNC + { + scalar_t pz = minCorner.z; + if (i & 1u) pz += columns[0].z; + if (i & 2u) pz += columns[1].z; + if (i & 4u) pz += columns[2].z; + return pz; + } + + // Ray-OBB intersection via slab test in OBB-local space. + // Returns (tMin, tMax, hit). tMin is clamped to 0 if ray starts inside. + // TODO: not optimized -- precompute inverse columns, handle f~=0 edge case + struct Intersection + { + scalar_t tMin; + scalar_t tMax; + bool hit; + }; + + Intersection rayIntersection(vec3_t rayOrigin, vec3_t rayDir) NBL_CONST_MEMBER_FUNC + { + // Vector from ray origin to minCorner + vec3_t delta = rayOrigin - minCorner; + + scalar_t tMin = scalar_t(-1e30); + scalar_t tMax = scalar_t(1e30); + + // Slab test against each OBB axis + for (uint32_t i = 0; i < 3; i++) + { + vec3_t axis = columns[i]; + scalar_t axisLenSq = dot(axis, axis); + scalar_t e = dot(axis, delta); + scalar_t f = dot(axis, rayDir); + + // Project ray onto axis: entry at t where dot = 0, exit where dot = axisLenSq + scalar_t invF = scalar_t(1.0) / f; + scalar_t t1 = (-e) * invF; + scalar_t t2 = (axisLenSq - e) * invF; + + if (t1 > t2) + { + scalar_t tmp = t1; + t1 = t2; + t2 = tmp; + } + + tMin = max(tMin, t1); + tMax = min(tMax, t2); + } + + Intersection result; + result.hit = (tMax >= tMin) && (tMax > scalar_t(0)); + result.tMin = max(tMin, scalar_t(0)); + result.tMax = tMax; + return result; + } +}; + } } } diff --git a/include/nbl/builtin/hlsl/shapes/obb_silhouette.hlsl b/include/nbl/builtin/hlsl/shapes/obb_silhouette.hlsl new file mode 100644 index 0000000000..83c668033c --- /dev/null +++ b/include/nbl/builtin/hlsl/shapes/obb_silhouette.hlsl @@ -0,0 +1,453 @@ +// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_BUILTIN_HLSL_SHAPES_OBB_SILHOUETTE_HLSL_INCLUDED_ +#define _NBL_BUILTIN_HLSL_SHAPES_OBB_SILHOUETTE_HLSL_INCLUDED_ + +#include +#include +#include +#include + +namespace nbl +{ +namespace hlsl +{ +namespace shapes +{ + +// Max vertices in an OBB silhouette after horizon clipping: a 6-vertex +// silhouette can gain at most one extra vertex from the partial clip endpoints. +NBL_CONSTEXPR uint32_t MaxOBBSilhouetteVertices = 7; + +// ============================================================================ +// 27-config silhouette table for an axis-aligned cube viewed from any of the +// 27 region cells (3 per axis). Each entry: {count, v0, v1, v2, v3, v4, v5} +// with vertices in CCW order relative to the viewer. +// ============================================================================ +// Human-readable LUT kept for reference / debugging. The hot path uses the +// packed binSilhouettes form below. +static const uint32_t silhouettes[27][7] = { + {6, 1, 3, 2, 6, 4, 5}, // 0: Black + {6, 2, 6, 4, 5, 7, 3}, // 1: White + {6, 0, 4, 5, 7, 3, 2}, // 2: Gray + {6, 1, 3, 7, 6, 4, 5}, // 3: Red + {4, 4, 5, 7, 6, 0, 0}, // 4: Green + {6, 0, 4, 5, 7, 6, 2}, // 5: Blue + {6, 0, 1, 3, 7, 6, 4}, // 6: Yellow + {6, 0, 1, 5, 7, 6, 4}, // 7: Magenta + {6, 0, 1, 5, 7, 6, 2}, // 8: Cyan + {6, 1, 3, 2, 6, 7, 5}, // 9: Orange + {4, 2, 6, 7, 3, 0, 0}, // 10: Light Orange + {6, 0, 4, 6, 7, 3, 2}, // 11: Dark Orange + {4, 1, 3, 7, 5, 0, 0}, // 12: Pink + {4, 0, 4, 6, 7, 3, 2}, // 13: Light Pink + {4, 0, 4, 6, 2, 0, 0}, // 14: Deep Rose + {6, 0, 1, 3, 7, 5, 4}, // 15: Purple + {4, 0, 1, 5, 4, 0, 0}, // 16: Light Purple + {6, 0, 1, 5, 4, 6, 2}, // 17: Indigo + {6, 0, 2, 6, 7, 5, 1}, // 18: Dark Green + {6, 0, 2, 6, 7, 3, 1}, // 19: Lime + {6, 0, 4, 6, 7, 3, 1}, // 20: Forest Green + {6, 0, 2, 3, 7, 5, 1}, // 21: Navy + {4, 0, 2, 3, 1, 0, 0}, // 22: Sky Blue + {6, 0, 4, 6, 2, 3, 1}, // 23: Teal + {6, 0, 2, 3, 7, 5, 4}, // 24: Brown + {6, 0, 2, 3, 1, 5, 4}, // 25: Tan/Beige + {6, 1, 5, 4, 6, 2, 3}, // 26: Dark Brown +}; + +// Binary packed silhouettes: bits 0-17 hold 6 corner indices (3 bits each), +// bits 29-31 hold the vertex count. Hot-path uses this; LUT above is reference. +static const uint32_t binSilhouettes[27] = { + 0b11000000000000101100110010011001, + 0b11000000000000011111101100110010, + 0b11000000000000010011111101100000, + 0b11000000000000101100110111011001, + 0b10000000000000000000110111101100, + 0b11000000000000010110111101100000, + 0b11000000000000100110111011001000, + 0b11000000000000100110111101001000, + 0b11000000000000010110111101001000, + 0b11000000000000101111110010011001, + 0b10000000000000000000011111110010, + 0b11000000000000010011111110100000, + 0b10000000000000000000101111011001, + 0b11000000000000010011111110100000, + 0b10000000000000000000010110100000, + 0b11000000000000100101111011001000, + 0b10000000000000000000100101001000, + 0b11000000000000010110100101001000, + 0b11000000000000001101111110010000, + 0b11000000000000001011111110010000, + 0b11000000000000001011111110100000, + 0b11000000000000001101111011010000, + 0b10000000000000000000001011010000, + 0b11000000000000001011010110100000, + 0b11000000000000100101111011010000, + 0b11000000000000100101001011010000, + 0b11000000000000011010110100101001, +}; + +struct BinSilhouette +{ + static BinSilhouette create(uint32_t configIndex) + { + BinSilhouette s; + s.data = binSilhouettes[configIndex]; + return s; + } + + uint32_t getVertexIndex(uint32_t index) NBL_CONST_MEMBER_FUNC + { + return (data >> (3u * index)) & 0x7u; + } + + uint32_t getVertexCount() NBL_CONST_MEMBER_FUNC + { + return (data >> 29u) & 0x7u; + } + + void rotr(uint32_t shift, uint32_t size) + { + data = nbl::hlsl::rotr(data, shift, size); + } + + void rotl(uint32_t shift, uint32_t size) + { + data = nbl::hlsl::rotl(data, shift, size); + } + + uint32_t data; +}; + +// Metadata-only descriptor of a clipped OBB silhouette (12 bytes). Vertex +// positions are NOT stored, consumers call materialize(view, verts) to +// fill a local array on demand, keeping vec3 storage out of struct-passing. +// +// silData: bits 0-17 rotated 3-bit corner indices (positive-z corners first +// in CCW order, then negative-z), bits 24-28 configIndex, bits 29-31 silhouette size. +// positiveCount: positive-z corners surviving the clip. +// count: emitted vertex count (positiveCount + 2 on partial clip, 0 if fully clipped). +struct ClippedSilhouette +{ + uint32_t silData; // rotated BinSilhouette data + size + uint32_t positiveCount; // # of positive-z OBB corners after rotation + uint32_t count; // total emitted vertex count consumers cascade on + float32_t3 shadingPoint; // observer position; baked into clipping + materialize + + static ClippedSilhouette create(shapes::OBBView view, float32_t3 shadingPoint) + { + uint32_t3 region; + uint32_t configIndex, vertexCount; + // OBB-local observer coord along axis i is dot(col_i, shadingPoint - minCorner); + // compare against [0, |col_i|^2] for branchless 27-config classify. + const float32_t3 toMin = view.minCorner - shadingPoint; + float32_t3 sqScales = float32_t3(dot(view.columns[0], view.columns[0]), dot(view.columns[1], view.columns[1]), dot(view.columns[2], view.columns[2])); + float32_t3 proj = -float32_t3(dot(view.columns[0], toMin), dot(view.columns[1], toMin), dot(view.columns[2], toMin)); + + uint32_t3 below = uint32_t3(proj < float32_t3(0, 0, 0)); + uint32_t3 above = uint32_t3(proj > sqScales); + region = uint32_t3(uint32_t3(1u, 1u, 1u) + below - above); + + configIndex = region.x + region.y * 3u + region.z * 9u; + + BinSilhouette sil = BinSilhouette::create(configIndex); + vertexCount = sil.getVertexCount(); + + // Always evaluate all 6 slots so the loop unrolls without a runtime + // branch on vertexCount; high bits are masked off below. + uint32_t validMask = (1u << vertexCount) - 1u; + uint32_t clipMask = 0u; + NBL_UNROLL + for (uint32_t i = 0; i < 6; i++) + clipMask |= (hlsl::select(view.getVertexZ(sil.getVertexIndex(i)) < shadingPoint.z, 1u, 0u)) << i; + clipMask &= validMask; + + uint32_t clipCount = countbits(clipMask); + uint32_t invertedMask = ~clipMask & validMask; + + // clipMask is masked to validMask, so the shift can't pull garbage into bit 0. + bool wrapAround = (clipMask & (clipMask >> (vertexCount - 1))) != 0u; + + uint32_t rotateAmount = nbl::hlsl::select(wrapAround, firstbitlow(invertedMask), // first positive + firstbithigh(clipMask) + 1); // first vertex after last negative + + sil.rotr(rotateAmount * 3, vertexCount * 3); + + ClippedSilhouette self; + // rotr wipes bits above width, so re-inject vertexCount and pack configIndex. + self.silData = sil.data | (configIndex << 24u) | (vertexCount << 29u); + self.positiveCount = vertexCount - clipCount; + const bool fullyClipped = (clipCount == vertexCount); + const bool partialClip = (clipCount > 0) && !fullyClipped; + self.count = nbl::hlsl::select(fullyClipped, 0u, self.positiveCount + (partialClip ? 2u : 0u)); + self.shadingPoint = shadingPoint; + + return self; + } + + uint32_t cornerIndex(uint32_t k) NBL_CONST_MEMBER_FUNC + { + return (silData >> (3u * k)) & 0x7u; + } + + uint32_t getVertexCount() NBL_CONST_MEMBER_FUNC { return (silData >> 29u) & 0x7u; } + uint32_t getConfigIndex() NBL_CONST_MEMBER_FUNC { return (silData >> 24u) & 0x1Fu; } + uint32_t3 getRegion() NBL_CONST_MEMBER_FUNC + { + const uint32_t ci = getConfigIndex(); + return uint32_t3(ci % 3u, (ci / 3u) % 3u, ci / 9u); + } + BinSilhouette getOriginalBinSilhouette() NBL_CONST_MEMBER_FUNC { return BinSilhouette::create(getConfigIndex()); } + + // Fill `count` vertices into the caller's local array. Each vertex is + // view.getVertex(cornerIndex(K)), columns[0/1/2] indexed by literal so + // SROA keeps them in registers and the 3 conditional adds run in parallel. + // Cascade on count rather than for+break so every vertices[K] write uses + // a literal slot index, otherwise the array demotes to Function memory. + // Vertices are returned in shading-point-relative coordinates (i.e. + // view.getVertex(...) - shadingPoint), so direction-from-shading-point + // reductions in consumers (cross/dot, gnomonic projection, horizon clip + // to z=0) are correct. + void materialize(shapes::OBBView view, out float32_t3 vertices[MaxOBBSilhouetteVertices]) NBL_CONST_MEMBER_FUNC + { + // Zero the unused tail; some consumers (DCE sinks) read + // the full 7-wide array. + NBL_UNROLL + for (uint32_t init = 0; init < MaxOBBSilhouetteVertices; init++) + vertices[init] = float32_t3(0.0f, 0.0f, 0.0f); + if (count == 0) + return; + + vertices[0] = view.getVertex(cornerIndex(0)) - shadingPoint; + if (positiveCount > 1) + { + vertices[1] = view.getVertex(cornerIndex(1)) - shadingPoint; + if (positiveCount > 2) + { + vertices[2] = view.getVertex(cornerIndex(2)) - shadingPoint; + if (positiveCount > 3) + { + vertices[3] = view.getVertex(cornerIndex(3)) - shadingPoint; + if (positiveCount > 4) + { + vertices[4] = view.getVertex(cornerIndex(4)) - shadingPoint; + if (positiveCount > 5) + { + vertices[5] = view.getVertex(cornerIndex(5)) - shadingPoint; + if (positiveCount > 6) + vertices[6] = view.getVertex(cornerIndex(6)) - shadingPoint; + } + } + } + } + } + + // Partial-clip: two extra getVertex calls for the negative-z endpoints + // around the positive run, lerped to z=0 (in shading-point-relative + // frame). Cascaded for literal slot indices. + if (count > positiveCount) + { + const uint32_t silSize = (silData >> 29u) & 0x7u; + const float32_t3 vFirstNeg = view.getVertex(cornerIndex(positiveCount)) - shadingPoint; + const float32_t3 vLastNeg = view.getVertex(cornerIndex(silSize - 1u)) - shadingPoint; + const float32_t3 vFirstPos = vertices[0]; + + if (positiveCount == 1) + { + const float32_t3 vLastPos = vertices[0]; + const float32_t tA = vLastPos.z / (vLastPos.z - vFirstNeg.z); + vertices[1] = lerp(vLastPos, vFirstNeg, tA); + const float32_t tB = vLastNeg.z / (vLastNeg.z - vFirstPos.z); + vertices[2] = lerp(vLastNeg, vFirstPos, tB); + } + else if (positiveCount == 2) + { + const float32_t3 vLastPos = vertices[1]; + const float32_t tA = vLastPos.z / (vLastPos.z - vFirstNeg.z); + vertices[2] = lerp(vLastPos, vFirstNeg, tA); + const float32_t tB = vLastNeg.z / (vLastNeg.z - vFirstPos.z); + vertices[3] = lerp(vLastNeg, vFirstPos, tB); + } + else if (positiveCount == 3) + { + const float32_t3 vLastPos = vertices[2]; + const float32_t tA = vLastPos.z / (vLastPos.z - vFirstNeg.z); + vertices[3] = lerp(vLastPos, vFirstNeg, tA); + const float32_t tB = vLastNeg.z / (vLastNeg.z - vFirstPos.z); + vertices[4] = lerp(vLastNeg, vFirstPos, tB); + } + else if (positiveCount == 4) + { + const float32_t3 vLastPos = vertices[3]; + const float32_t tA = vLastPos.z / (vLastPos.z - vFirstNeg.z); + vertices[4] = lerp(vLastPos, vFirstNeg, tA); + const float32_t tB = vLastNeg.z / (vLastNeg.z - vFirstPos.z); + vertices[5] = lerp(vLastNeg, vFirstPos, tB); + } + else // positiveCount == 5; positiveCount == 6 -> count == 8 > 7, impossible + { + const float32_t3 vLastPos = vertices[4]; + const float32_t tA = vLastPos.z / (vLastPos.z - vFirstNeg.z); + vertices[5] = lerp(vLastPos, vFirstNeg, tA); + const float32_t tB = vLastNeg.z / (vLastNeg.z - vFirstPos.z); + vertices[6] = lerp(vLastNeg, vFirstPos, tB); + } + } + } + + // materialize + per-vertex normalize. Cascaded for literal slot indices. + void materializeNormalized(shapes::OBBView view, out float32_t3 vertices[MaxOBBSilhouetteVertices]) NBL_CONST_MEMBER_FUNC + { + materialize(view, vertices); + vertices[0] = nbl::hlsl::normalize(vertices[0]); + if (count > 1) + { + vertices[1] = nbl::hlsl::normalize(vertices[1]); + if (count > 2) + { + vertices[2] = nbl::hlsl::normalize(vertices[2]); + if (count > 3) + { + vertices[3] = nbl::hlsl::normalize(vertices[3]); + if (count > 4) + { + vertices[4] = nbl::hlsl::normalize(vertices[4]); + if (count > 5) + { + vertices[5] = nbl::hlsl::normalize(vertices[5]); + if (count > 6) + vertices[6] = nbl::hlsl::normalize(vertices[6]); + } + } + } + } + } + } +}; + +struct SilEdgeNormals +{ + // Sentinel for unused edge slots: dot(dir, (0,0,-1)) = -dir.z. Callers + // gate isInside on dir.z > 0, so this dot is always negative for them, + // its asuint has the sign bit set, which makes the bitwise-AND + // reduction in isInside() pass through the real sign bits unchanged. + static SilEdgeNormals initSentinel() + { + SilEdgeNormals result; + NBL_UNROLL + for (uint32_t i = 0; i < MaxOBBSilhouetteVertices; i++) + result.edgeNormals[i] = float32_t3(0.0f, 0.0f, -1.0f); + return result; + } + + // Build per-edge cross products from a materialized vertex array. + static SilEdgeNormals create(float32_t3 vertices[MaxOBBSilhouetteVertices], uint32_t count) + { + SilEdgeNormals result = initSentinel(); + + float32_t3 v0 = vertices[0]; + float32_t3 v1 = vertices[1]; + float32_t3 v2 = vertices[2]; + + result.edgeNormals[0] = cross(v0, v1); + result.edgeNormals[1] = cross(v1, v2); + + if (count > 3) + { + float32_t3 v3 = vertices[3]; + result.edgeNormals[2] = cross(v2, v3); + + if (count > 4) + { + float32_t3 v4 = vertices[4]; + result.edgeNormals[3] = cross(v3, v4); + + if (count > 5) + { + float32_t3 v5 = vertices[5]; + result.edgeNormals[4] = cross(v4, v5); + + if (count > 6) + { + float32_t3 v6 = vertices[6]; + result.edgeNormals[5] = cross(v5, v6); + result.edgeNormals[6] = cross(v6, v0); + } + else + { + result.edgeNormals[5] = cross(v5, v0); + } + } + else + { + result.edgeNormals[4] = cross(v4, v0); + } + } + else + { + result.edgeNormals[3] = cross(v3, v0); + } + } + else + { + result.edgeNormals[2] = cross(v2, v0); + } + + return result; + } + + // Sign-bit AND reduction: dot <= 0 iff asuint(dot) sign bit set (modulo +0.0 + // exact-boundary samples, which never hit in practice). 6 ANDs on the INT + // pipe instead of 6 fmaxes on the FP pipe; lets the FP pipe stay busy with + // the 7 dot products on Ampere's split FP/INT scheduler. + bool isInside(float32_t3 dir) + { + const float32_t d0 = hlsl::dot(dir, edgeNormals[0]); + const float32_t d1 = hlsl::dot(dir, edgeNormals[1]); + const float32_t d2 = hlsl::dot(dir, edgeNormals[2]); + const float32_t d3 = hlsl::dot(dir, edgeNormals[3]); + const float32_t d4 = hlsl::dot(dir, edgeNormals[4]); + const float32_t d5 = hlsl::dot(dir, edgeNormals[5]); + const float32_t d6 = hlsl::dot(dir, edgeNormals[6]); + const uint32_t allNeg = asuint(d0) & asuint(d1) & asuint(d2) & asuint(d3) & asuint(d4) & asuint(d5) & asuint(d6); + return (allNeg & 0x80000000u) != 0u; + } + + // Transform edge normals from world-space to the pyramid's local frame in-place. + // After this, edgeNormals[i] = (dot(n, axis1), dot(n, axis2), dot(n, axis3)) + // and isInsideLocal() can do 2-FMA half-plane tests without extra storage. + // NOTE: destroys world-space normals, isInside() will no longer work correctly. + void transformToLocal(float32_t3 axis1, float32_t3 axis2, float32_t3 axis3) + { + NBL_UNROLL + for (uint32_t i = 0; i < MaxOBBSilhouetteVertices; i++) + { + float32_t3 n = edgeNormals[i]; + edgeNormals[i] = float32_t3(dot(n, axis1), dot(n, axis2), dot(n, axis3)); + } + } + + // 2D gnomonic containment test after transformToLocal(). + // dot(dir_unnorm, n_local) = localX * n.x + localY * n.y + n.z + bool isInsideLocal(float32_t localX, float32_t localY) + { + float32_t maxDot = localX * edgeNormals[0].x + localY * edgeNormals[0].y + edgeNormals[0].z; + maxDot = hlsl::max(maxDot, localX * edgeNormals[1].x + localY * edgeNormals[1].y + edgeNormals[1].z); + maxDot = hlsl::max(maxDot, localX * edgeNormals[2].x + localY * edgeNormals[2].y + edgeNormals[2].z); + maxDot = hlsl::max(maxDot, localX * edgeNormals[3].x + localY * edgeNormals[3].y + edgeNormals[3].z); + maxDot = hlsl::max(maxDot, localX * edgeNormals[4].x + localY * edgeNormals[4].y + edgeNormals[4].z); + maxDot = hlsl::max(maxDot, localX * edgeNormals[5].x + localY * edgeNormals[5].y + edgeNormals[5].z); + maxDot = hlsl::max(maxDot, localX * edgeNormals[6].x + localY * edgeNormals[6].y + edgeNormals[6].z); + return maxDot <= 0.0f; + } + + float32_t3 edgeNormals[MaxOBBSilhouetteVertices]; +}; + +} +} +} + +#endif diff --git a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl index 5895e4bc80..7ffe2ef407 100644 --- a/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl +++ b/include/nbl/builtin/hlsl/shapes/spherical_rectangle.hlsl @@ -94,6 +94,8 @@ struct SphericalRectangle vector4_type cosGamma; }; + // TODO: take an observer already, this way we can precompute and store the `r0`, `denorm_n_z` and `rcpLen_denorm_n_z` + // we need all of the above for solid angle and projected solid angle computation static SphericalRectangle create(NBL_CONST_REF_ARG(CompressedSphericalRectangle) compressed) { SphericalRectangle retval; @@ -102,7 +104,8 @@ struct SphericalRectangle retval.basis[0] = compressed.right / retval.extents[0]; retval.basis[1] = compressed.up / retval.extents[1]; assert(hlsl::abs(hlsl::dot(retval.basis[0], retval.basis[1])) < scalar_type(1e-5)); - retval.basis[2] = hlsl::normalize(hlsl::cross(retval.basis[0], retval.basis[1])); + // don't normalize, the `right` and `up` vectors are orthogonal and the two bases are normalized already! + retval.basis[2] = hlsl::cross(retval.basis[0], retval.basis[1]); return retval; } @@ -112,7 +115,8 @@ struct SphericalRectangle result.r0 = hlsl::mul(basis, origin - observer); const vector4_type denorm_n_z = vector4_type(-result.r0.y, result.r0.x + extents.x, result.r0.y + extents.y, -result.r0.x); - result.n_z = denorm_n_z * hlsl::rsqrt(hlsl::promote(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z); + const vector4_type rcpLen_denorm_n_z = hlsl::rsqrt(hlsl::promote(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z); + result.n_z = denorm_n_z * rcpLen_denorm_n_z; result.cosGamma = vector4_type( -result.n_z[0] * result.n_z[1], -result.n_z[1] * result.n_z[2], @@ -128,17 +132,17 @@ struct SphericalRectangle } // Kelvin-Stokes theorem: signed projected solid angle = integral_{rect} (n . omega) d_omega + // TODO: don't take the observer, observer should be taken at creation scalar_type projectedSolidAngle(const vector3_type observer, const vector3_type receiverNormal) NBL_CONST_MEMBER_FUNC { return projectedSolidAngleFromLocal(hlsl::mul(basis, origin - observer), hlsl::mul(basis, receiverNormal)); } - // Overload for when r0 and localNormal are already computed (avoids redundant mul(basis, ...)). - // Exploits rectangle structure: all 4 corners share the same z, so cross products - // have only 2 nonzero components each, and externalProducts can be computed without - // normalizing the corner directions. + // TODO: only take a `localN` scalar_type projectedSolidAngleFromLocal(const vector3_type r0, const vector3_type n) NBL_CONST_MEMBER_FUNC { + // FUN FACT: `n_z` already holds Z coordinate the NORMALIZED `awayFromEdgePlane`, the non-zero coordinate absolute value is equal to `r0.z * rcpLen_denorm_n_z` +// TODO: skip all this code until just call `acos` on the `unnormDots` const scalar_type x0 = r0.x, y0 = r0.y, z = r0.z; const scalar_type x1 = x0 + extents.x; const scalar_type y1 = y0 + extents.y; @@ -154,8 +158,9 @@ struct SphericalRectangle zSq + x1 * x1, zSq + y1 * y1, zSq + x0 * x0 - ); + ); // TODO: this is already computed as `rcpLen_denorm_n_z` +// TODO: this can be computed from `denorm_n_z`, `z` and `rcpLen_denorm_n_z` instead // dot(cross(ri,rj), n) / |cross(ri,rj)| the ex/ey scale factors cancel const vector4_type crossDotN = vector4_type( z * n.y - y0 * n.z, @@ -164,8 +169,12 @@ struct SphericalRectangle z * n.x - x0 * n.z ); // The ABS makes the computation correct for abs(cos(theta)) (BSDF projected solid angle). - const vector4_type externalProducts = hlsl::abs(crossDotN) * hlsl::rsqrt(crossLenSq); + const vector4_type externalProducts = crossDotN * hlsl::rsqrt(crossLenSq); +// TODO: isn't `rcpLen_denorm_n_z` related to the sin^-1() of arclengths ? Wouldn't `ACOS_CSC` apply instead of `acos*rsqrt(1-cos^2)` +// wouldn't then `hlsl::promote(result.r0.z * result.r0.z) + denorm_n_z * denorm_n_z` be the sin^2 ? +// it would probably have to be a different, here's the `acos(sqrt(1-x*x))/x` curve fit again revealed to me in a dream +// https://www.desmos.com/calculator/sbdrulot5a = exp2(-1.6*sqrt(1-x*x))*A+B // cos(arc length) between adjacent corners: dot(ri,rj) / (|ri|*|rj|) const vector4_type lenSq = vector4_type( x0 * x0 + y0 * y0, @@ -184,9 +193,11 @@ struct SphericalRectangle // rcpLen[i]*rcpLen[j] for adjacent pairs: (0,1), (1,2), (2,3), (3,0) const vector4_type cos_sides = unnormDots * rcpLen * rcpLen.yzwx; + // TODO: there's the same opportunity for optimization of this as the Spherical Triangle + // https://www.linkedin.com/posts/matt-kielan-9b054a165_untitled-graph-activity-7442910005671923712-jHz6?utm_source=share&utm_medium=member_desktop&rcm=ACoAACdp2RQBqq2bJfC2zxpsme-vRv2zh9oP-8E const vector4_type pyramidAngles = hlsl::acos(cos_sides); - return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5); + return hlsl::abs(hlsl::dot(pyramidAngles, externalProducts)) * scalar_type(0.5); } vector3_type origin; diff --git a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl index 1a5681b39e..b8a2c7229e 100644 --- a/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl +++ b/include/nbl/builtin/hlsl/shapes/spherical_triangle.hlsl @@ -20,6 +20,26 @@ namespace hlsl namespace shapes { +// TODO: move to where fast_acos lives +template +T acos_csc_approx(const T arg) +{ + const T u = hlsl::log2(_static_cast(1)+arg); + // The curve fit "revealed in a dream" to me is `exp2(F(log2(x+1)))` where `F(u)` is a polynomial + // I have a feeling that a polynomial of ((Au+B)u+C)u+D could be sufficient if it has following properties: + // `F(0) = 0` and + // `F(u) <= log2(\frac{\cos^{-1}\left(2^{x}-1\right)}{\sqrt{1-\left(2^{x}-1\right)^{2}}})` because you want to consistently under-estimate the Projected Solid Angle to avoid creating energy + // See https://www.desmos.com/calculator/sdptomhbju + // Furthermore we could clip the polynomial calc to `Cu+D or `(Bu+C)u+D` for small arguments + T poly; + // TODO: actually optimize these constants in real world scenarios (renders) + if (order==1) + poly = (_static_cast(1)-u)*_static_cast(0.6); + else if (order==2) + poly = (_static_cast(1)-u)*_static_cast(0.637)+(_static_cast(1) - u * u) * _static_cast(0.0115); + return hlsl::exp2(poly); +} + template struct SphericalTriangle { @@ -54,6 +74,7 @@ struct SphericalTriangle // degenerate triangle: any side has near-zero sin, so csc blows up if (hlsl::any >(retval.csc_sides >= hlsl::promote(numeric_limits::max))) { + // TODO: can't do this, still need to be able to sample thin triangle like a line light, so need to know all the angles which are still valid retval.cos_vertices = hlsl::promote(0.0); retval.sin_vertices = hlsl::promote(0.0); retval.solid_angle = 0; @@ -86,35 +107,29 @@ struct SphericalTriangle if (solid_angle <= numeric_limits::epsilon) return 0; - matrix awayFromEdgePlane; - awayFromEdgePlane[0] = hlsl::cross(vertices[1], vertices[2]) * csc_sides[0]; - awayFromEdgePlane[1] = hlsl::cross(vertices[2], vertices[0]) * csc_sides[1]; - awayFromEdgePlane[2] = hlsl::cross(vertices[0], vertices[1]) * csc_sides[2]; + // `cross(A,B)*acos(dot(A,B))/sin(1-dot^2)` can be done with `cross(A,B)*acos_csc_approx(dot(A,B))` +#define ACOS_CSC(I) acos_csc_approx(cos_sides[I]) +//#define ACOS_CSC(I) hlsl::acos(cos_sides[I])*csc_sides[I] + scalar_type externalProductsWeightedByPyramidAngles = hlsl::dot(hlsl::cross(vertices[1], vertices[2]),receiverNormal) * ACOS_CSC(0); + externalProductsWeightedByPyramidAngles += hlsl::dot(hlsl::cross(vertices[2], vertices[0]),receiverNormal) * ACOS_CSC(1); + externalProductsWeightedByPyramidAngles += hlsl::dot(hlsl::cross(vertices[0], vertices[1]),receiverNormal) * ACOS_CSC(2); +#undef ACOS_CSC + // The ABS makes it so that the computation is correct for an `abs(cos(theta))` factor which is the projected solid angle used for a BSDF. + // It also makes the computation insensitive to the CW or CCW winding of the vertices in the triangle. // Proof: Kelvin-Stokes theorem, if you split the set into two along the horizon with constant CCW winding, the `cross` along the shared edge // goes in different directions and cancels out, while `acos` of the clipped great arcs corresponding to polygon edges add up to the original sides again. - const vector3_type externalProducts = hlsl::abs(hlsl::mul(/* transposed already */awayFromEdgePlane, receiverNormal)); - - // Far TODO: `cross(A,B)*acos(dot(A,B))/sin(1-dot^2)` can be done with `cross*acos_csc_approx(dot(A,B))` - // We could skip the `csc_sides` factor, and computing `pyramidAngles` and replace them with this approximation weighting before the dot product with the receiver notmal - // The curve fit "revealed in a dream" to me is `exp2(F(log2(x+1)))` where `F(u)` is a polynomial, so far I've calculated `F = (1-u)0.635+(1-u^2)0.0118` which gives <5% error until 165 degrees - // I have a feeling that a polynomial of ((Au+B)u+C)u+D could be sufficient if it has following properties: - // `F(0) = 0` and - // `F(u) <= log2(\frac{\cos^{-1}\left(2^{x}-1\right)}{\sqrt{1-\left(2^{x}-1\right)^{2}}})` because you want to consistently under-estimate the Projected Solid Angle to avoid creating energy - // See https://www.desmos.com/calculator/sdptomhbju - // Furthermore we could clip the polynomial calc to `Cu+D or `(Bu+C)u+D` for small arguments - const vector3_type pyramidAngles = hlsl::acos(cos_sides); - // So that triangle covering almost whole hemisphere sums to PI - return hlsl::dot(pyramidAngles, externalProducts) * scalar_type(0.5); + // The 0.5 is so that triangle covering almost whole hemisphere sums to PI + return externalProductsWeightedByPyramidAngles * scalar_type(0.5); } vector3_type vertices[3]; // angles of vertices with origin, so the sides are INSIDE the sphere vector3_type cos_sides; - vector3_type csc_sides; + vector3_type csc_sides; // TODO: spherical triangle sampling only needs `csc_sides[1]` and possibly `csc_sides[2]` // angles between arcs on the sphere, so angles in the TANGENT plane at each vertex vector3_type cos_vertices; - vector3_type sin_vertices; + vector3_type sin_vertices; // TODO: spherical triangle sampling only needs `sin_vertices[0]` a.k.a `sinA` scalar_type solid_angle; }; diff --git a/include/nbl/core/sampling/RandomSampler.h b/include/nbl/core/sampling/RandomSampler.h index 39832dc8f1..b692ef5e08 100644 --- a/include/nbl/core/sampling/RandomSampler.h +++ b/include/nbl/core/sampling/RandomSampler.h @@ -11,8 +11,8 @@ namespace nbl::core { -class RandomSampler -{ + class RandomSampler + { public: RandomSampler(uint32_t _seed) { @@ -25,9 +25,24 @@ class RandomSampler return mersenneTwister(); } + // Returns a float in [0, 1) + inline float nextFloat() + { + // 1 / 2^32 + constexpr float norm = 1.0f / 4294967296.0f; + return mersenneTwister() * norm; + } + + // Returns a float in [min, max) + inline float nextFloat(float min, float max) + { + constexpr float norm = 1.0f / 4294967296.0f; + return min + (mersenneTwister() * norm) * (max - min); + } + protected: std::mt19937 mersenneTwister; -}; + }; } diff --git a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h index 39013417dc..53cef98d71 100644 --- a/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h +++ b/include/nbl/ext/FullScreenTriangle/FullScreenTriangle.h @@ -24,11 +24,18 @@ struct ProtoPipeline final const video::IGPURenderpass* renderpass, const uint32_t subpassIx=0, asset::SBlendParams blendParams = {}, + asset::SRasterizationParams rasterizationParams = DefaultRasterParams, const hlsl::SurfaceTransform::FLAG_BITS swapchainTransform=hlsl::SurfaceTransform::FLAG_BITS::IDENTITY_BIT, video::IGPUPipelineCache* pipelineCache = nullptr ); core::smart_refctd_ptr m_vxShader; + + constexpr static inline asset::SRasterizationParams DefaultRasterParams = { + .faceCullingMode = asset::EFCM_NONE, + .depthWriteEnable = false, + .depthCompareOp = asset::ECO_ALWAYS + }; }; bool recordDrawCall(video::IGPUCommandBuffer* commandBuffer); diff --git a/include/nbl/video/IGPUPipeline.h b/include/nbl/video/IGPUPipeline.h index aeae8c866a..6e2d299cec 100644 --- a/include/nbl/video/IGPUPipeline.h +++ b/include/nbl/video/IGPUPipeline.h @@ -12,6 +12,8 @@ #include "nbl/asset/IPipeline.h" #include "nbl/system/to_string.h" +#include + namespace nbl::video { @@ -128,6 +130,58 @@ class IGPUPipelineBase { using SShaderEntryMap = SShaderSpecInfo::entry_map_t; + // One row of VK_KHR_pipeline_executable_properties statistics, kept as the driver + // returned it (no formatting, no string scraping). The string field on + // SExecutableInfo below is still populated -- this is a parallel, structured view + // for callers that need numbers (benchmarks, regressions, baseline diffs). + struct SExecutableStatistic + { + enum class FORMAT : uint8_t { BOOL32, INT64, UINT64, FLOAT64 }; + std::string name; + std::string description; + FORMAT format = FORMAT::UINT64; + union Value + { + bool b32; + int64_t i64; + uint64_t u64; + double f64; + } value = {}; + + // Convenience: collapse to a uint64_t regardless of the original format. + // Matches what most consumers want -- counters, sizes, register tallies. + inline uint64_t asUint() const + { + switch (format) + { + case FORMAT::BOOL32: return value.b32 ? 1u : 0u; + case FORMAT::INT64: return value.i64 < 0 ? 0u : uint64_t(value.i64); + case FORMAT::UINT64: return value.u64; + case FORMAT::FLOAT64: return value.f64 < 0.0 ? 0u : uint64_t(value.f64); + } + return 0u; + } + }; + + // One IR (e.g. SPIR-V, AMD ISA, NV SASS) returned by the driver. Text payloads + // are stored without the trailing NUL the driver adds; binary payloads are kept + // verbatim. Lets callers dump to a file or hash the data without re-parsing the + // pretty-printed blob. + struct SInternalRepresentation + { + std::string name; + std::string description; + bool isText = false; + core::vector data; + + // View text payloads as a string_view without copying. Empty for binary IRs. + inline std::string_view asText() const + { + if (!isText || data.empty()) return {}; + return std::string_view(reinterpret_cast(data.data()), data.size()); + } + }; + // Per-executable info from VK_KHR_pipeline_executable_properties struct SExecutableInfo { @@ -135,8 +189,10 @@ class IGPUPipelineBase { std::string description; core::bitflag stages = hlsl::ShaderStage::ESS_UNKNOWN; uint32_t subgroupSize = 0; - std::string statistics; - std::string internalRepresentations; + std::string statistics; // human-readable, aligned columns; what users log today + core::vector structuredStatistics; // same data, structured; for programmatic use + std::string internalRepresentations; // human-readable concatenation of IRs (textual ones inline, binaries as "[binary data, N bytes]") + core::vector structuredInternalRepresentations; // same data, structured; for programmatic use (file dump, hashing, etc.) }; inline std::span getExecutableInfo() const { return m_executableInfo; } diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 12b4af1bef..a6985b35d1 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -269,6 +269,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/spherical_triangle.hls LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/spherical_rectangle.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/aabb.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/obb.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/shapes/obb_silhouette.hlsl") #sampling LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/basic.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/linear.hlsl") @@ -281,6 +282,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/spherical_triangle.h LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/projected_spherical_triangle.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/spherical_rectangle.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/projected_spherical_rectangle.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/spherical_pyramid.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/cos_weighted_spheres.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/quotient_and_pdf.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/sampling/quotient_and_weight.hlsl") diff --git a/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp b/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp index 58b1f2ea84..55dea2eb00 100644 --- a/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp +++ b/src/nbl/ext/FullScreenTriangle/CFullScreenTriangle.cpp @@ -84,6 +84,7 @@ smart_refctd_ptr ProtoPipeline::createPipeline( const IGPURenderpass* renderpass, const uint32_t subpassIx, SBlendParams blendParams, + asset::SRasterizationParams rasterizationParams, const hlsl::SurfaceTransform::FLAG_BITS swapchainTransform, IGPUPipelineCache* pipelineCache) { @@ -94,11 +95,6 @@ smart_refctd_ptr ProtoPipeline::createPipeline( smart_refctd_ptr m_retval; { - constexpr SRasterizationParams defaultRasterParams = { - .faceCullingMode = EFCM_NONE, - .depthWriteEnable = false, - .depthCompareOp = ECO_ALWAYS - }; const auto orientationAsUint32 = static_cast(swapchainTransform); IGPUPipelineBase::SShaderEntryMap specConstants; @@ -111,7 +107,7 @@ smart_refctd_ptr ProtoPipeline::createPipeline( params[0].cached = { .vertexInput = {}, // The Full Screen Triangle doesn't use any HW vertex input state .primitiveAssembly = {}, - .rasterization = defaultRasterParams, + .rasterization = rasterizationParams, .blend = blendParams, .subpassIx = subpassIx }; diff --git a/src/nbl/video/CVulkanPipelineExecutableInfo.h b/src/nbl/video/CVulkanPipelineExecutableInfo.h index 4cdf1f194f..79f1d8d573 100644 --- a/src/nbl/video/CVulkanPipelineExecutableInfo.h +++ b/src/nbl/video/CVulkanPipelineExecutableInfo.h @@ -6,6 +6,8 @@ #include +#include + namespace nbl::video { @@ -16,7 +18,8 @@ inline void populateExecutableInfoFromVulkan(core::vectorvk.vkGetPipelineExecutablePropertiesKHR(vkDevice, &pipelineInfo, &executableCount, nullptr); + if (vk->vk.vkGetPipelineExecutablePropertiesKHR(vkDevice, &pipelineInfo, &executableCount, nullptr) != VK_SUCCESS) + return; if (executableCount == 0) return; @@ -24,7 +27,8 @@ inline void populateExecutableInfoFromVulkan(core::vector properties(executableCount); for (uint32_t i = 0; i < executableCount; ++i) properties[i] = {VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_PROPERTIES_KHR, nullptr}; - vk->vk.vkGetPipelineExecutablePropertiesKHR(vkDevice, &pipelineInfo, &executableCount, properties.data()); + if (vk->vk.vkGetPipelineExecutablePropertiesKHR(vkDevice, &pipelineInfo, &executableCount, properties.data()) != VK_SUCCESS) + return; outInfo.resize(executableCount); @@ -43,53 +47,75 @@ inline void populateExecutableInfoFromVulkan(core::vectorvk.vkGetPipelineExecutableStatisticsKHR(vkDevice, &execInfo, &statCount, nullptr); + if (vk->vk.vkGetPipelineExecutableStatisticsKHR(vkDevice, &execInfo, &statCount, nullptr) != VK_SUCCESS) + statCount = 0; if (statCount > 0) { core::vector stats(statCount); for (uint32_t s = 0; s < statCount; ++s) stats[s] = {VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_STATISTIC_KHR, nullptr}; - vk->vk.vkGetPipelineExecutableStatisticsKHR(vkDevice, &execInfo, &statCount, stats.data()); + if (vk->vk.vkGetPipelineExecutableStatisticsKHR(vkDevice, &execInfo, &statCount, stats.data()) != VK_SUCCESS) + statCount = 0; - // First pass: format name:value pairs and find max width for alignment - core::vector nameValues(statCount); - size_t maxNameValueLen = 0; - for (uint32_t s = 0; s < statCount; ++s) + if (statCount > 0) { - const auto& stat = stats[s]; - std::string value; - switch (stat.format) + info.structuredStatistics.resize(statCount); + + // First pass: format name:value pairs (for the human-readable string) and + // fill structuredStatistics in lockstep so callers can pick whichever view + // they need without re-parsing. + core::vector nameValues(statCount); + size_t maxNameValueLen = 0; + for (uint32_t s = 0; s < statCount; ++s) { - case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_BOOL32_KHR: - value = stat.value.b32 ? "true" : "false"; - break; - case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_INT64_KHR: - value = std::to_string(stat.value.i64); - break; - case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR: - value = std::to_string(stat.value.u64); - break; - case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_FLOAT64_KHR: - value = std::to_string(stat.value.f64); - break; - default: - value = ""; - break; + const auto& stat = stats[s]; + auto& outStat = info.structuredStatistics[s]; + outStat.name = stat.name; + outStat.description = stat.description; + + std::string value; + switch (stat.format) + { + case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_BOOL32_KHR: + outStat.format = IGPUPipelineBase::SExecutableStatistic::FORMAT::BOOL32; + outStat.value.b32 = stat.value.b32 != VK_FALSE; + value = outStat.value.b32 ? "true" : "false"; + break; + case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_INT64_KHR: + outStat.format = IGPUPipelineBase::SExecutableStatistic::FORMAT::INT64; + outStat.value.i64 = stat.value.i64; + value = std::to_string(stat.value.i64); + break; + case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR: + outStat.format = IGPUPipelineBase::SExecutableStatistic::FORMAT::UINT64; + outStat.value.u64 = stat.value.u64; + value = std::to_string(stat.value.u64); + break; + case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_FLOAT64_KHR: + outStat.format = IGPUPipelineBase::SExecutableStatistic::FORMAT::FLOAT64; + outStat.value.f64 = stat.value.f64; + value = std::to_string(stat.value.f64); + break; + default: + // Unknown format: leave structured value zero, keep raw text marker + value = ""; + break; + } + nameValues[s] = std::string(stat.name) + ": " + value; + maxNameValueLen = std::max(maxNameValueLen, nameValues[s].size()); } - nameValues[s] = std::string(stat.name) + ": " + value; - maxNameValueLen = std::max(maxNameValueLen, nameValues[s].size()); - } - // Second pass: emit with aligned columns - std::string& statsStr = info.statistics; - for (uint32_t s = 0; s < statCount; ++s) - { - statsStr += nameValues[s]; - statsStr.append(maxNameValueLen - nameValues[s].size() + 4, ' '); - statsStr += "// "; - statsStr += stats[s].description; - statsStr += "\n"; + // Second pass: emit with aligned columns (unchanged human-readable format) + std::string& statsStr = info.statistics; + for (uint32_t s = 0; s < statCount; ++s) + { + statsStr += nameValues[s]; + statsStr.append(maxNameValueLen - nameValues[s].size() + 4, ' '); + statsStr += "// "; + statsStr += stats[s].description; + statsStr += "\n"; + } } } @@ -97,7 +123,8 @@ inline void populateExecutableInfoFromVulkan(core::vectorvk.vkGetPipelineExecutableInternalRepresentationsKHR(vkDevice, &execInfo, &irCount, nullptr); + if (vk->vk.vkGetPipelineExecutableInternalRepresentationsKHR(vkDevice, &execInfo, &irCount, nullptr) != VK_SUCCESS) + irCount = 0; if (irCount > 0) { @@ -106,7 +133,8 @@ inline void populateExecutableInfoFromVulkan(core::vectorvk.vkGetPipelineExecutableInternalRepresentationsKHR(vkDevice, &execInfo, &irCount, irs.data()); + if (vk->vk.vkGetPipelineExecutableInternalRepresentationsKHR(vkDevice, &execInfo, &irCount, irs.data()) != VK_SUCCESS) + continue; // Allocate data buffers and second call to get data core::vector> irData(irCount); @@ -116,26 +144,41 @@ inline void populateExecutableInfoFromVulkan(core::vectorvk.vkGetPipelineExecutableInternalRepresentationsKHR(vkDevice, &execInfo, &irCount, irs.data()); + if (vk->vk.vkGetPipelineExecutableInternalRepresentationsKHR(vkDevice, &execInfo, &irCount, irs.data()) != VK_SUCCESS) + continue; + + info.structuredInternalRepresentations.resize(irCount); std::string& irStr = info.internalRepresentations; for (uint32_t r = 0; r < irCount; ++r) { + auto& outIr = info.structuredInternalRepresentations[r]; + outIr.name = irs[r].name; + outIr.description = irs[r].description; + outIr.isText = irs[r].isText != VK_FALSE; + // Text payloads include a trailing NUL per the spec; drop it from the + // structured copy so asText().size() matches the textual length. + const size_t rawSize = irs[r].dataSize; + const size_t copySize = outIr.isText && rawSize > 0 ? rawSize - 1 : rawSize; + outIr.data.resize(copySize); + if (copySize > 0) + std::memcpy(outIr.data.data(), irs[r].pData, copySize); + irStr += "---- "; irStr += irs[r].name; irStr += " ----\n"; irStr += irs[r].description; irStr += "\n"; - if (irs[r].isText) + if (outIr.isText) { auto* str = static_cast(irs[r].pData); - irStr.append(str, irs[r].dataSize > 0 ? irs[r].dataSize - 1 : 0); + irStr.append(str, copySize); irStr += "\n"; } else { irStr += "[binary data, "; - irStr += std::to_string(irs[r].dataSize); + irStr += std::to_string(rawSize); irStr += " bytes]\n"; } }