diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 64a06b16f..1597b145e 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -182,9 +182,7 @@ struct ShapeSampling const vector3_type tri_vertices[3] = {tri.vertex0, tri.vertex1, tri.vertex2}; shapes::SphericalTriangle st = shapes::SphericalTriangle::create(tri_vertices, ray.origin); sampling::ProjectedSphericalTriangle pst = sampling::ProjectedSphericalTriangle::create(st, ray.normalAtOrigin, ray.wasBSDFAtOrigin); - const scalar_type pdf = pst.backwardPdf(L); - // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small - return pdf < numeric_limits::max ? pdf : numeric_limits::max; + return pst.backwardWeight(L); } template @@ -252,6 +250,7 @@ template struct ShapeSampling { using scalar_type = T; + using vector2_type = vector; using vector3_type = vector; static ShapeSampling create(NBL_CONST_REF_ARG(Shape) rect) @@ -268,49 +267,58 @@ struct ShapeSampling matrix rectNormalBasis; vector rectExtents; rect.getNormalBasis(rectNormalBasis, rectExtents); + shapes::SphericalRectangle sphR0; sphR0.origin = rect.offset; sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; - scalar_type solidAngle = sphR0.solidAngle(ray.origin).value; - if (solidAngle > numeric_limits::min) - pdf = 1.f / solidAngle; - else - pdf = bit_cast(numeric_limits::infinity); - return pdf; + + // 1.f/0.f gives infinity no special checks needed + return 1.f / sphR0.solidAngle(ray.origin).value; } template vector3_type generateAndPdfAndWeight(NBL_REF_ARG(scalar_type) pdf, NBL_REF_ARG(scalar_type) weight, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(Aniso) interaction, NBL_CONST_REF_ARG(vector3_type) xi) { - const vector3_type N = 
rect.getNormalTimesArea(); - const vector3_type origin2origin = rect.offset - origin; - matrix rectNormalBasis; vector rectExtents; rect.getNormalBasis(rectNormalBasis, rectExtents); + shapes::SphericalRectangle sphR0; sphR0.origin = rect.offset; sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; - vector3_type L = hlsl::promote(0.0); + // sampling::SphericalRectangle ssph = sampling::SphericalRectangle::create(sphR0, origin); - if ( ssph.solidAngle > numeric_limits::min) + typename sampling::SphericalRectangle::cache_type cache; + + vector3_type L = hlsl::promote(0.0); + const bool FastVersion = true; + if (FastVersion) { - typename sampling::SphericalRectangle::cache_type cache; - const vector3_type localDir = ssph.generate(xi.xy, cache); - // not sure if generate() can produce NaN/inf when solidAngle > min - assert(!hlsl::any(hlsl::isinf(localDir) || hlsl::isnan(localDir))); - // transform local direction to world space - L = localDir.x * rectNormalBasis[0] + localDir.y * rectNormalBasis[1] + localDir.z * rectNormalBasis[2]; - pdf = ssph.forwardPdf(xi.xy, cache); - weight = ssph.forwardWeight(xi.xy, cache); + // actually the slowest + //L = ssph.generate(xi.xy, cache); + //newRayMaxT = ssph.computeHitT(L); + + // fastest + const vector3_type localL = ssph.generateNormalizedLocal(xi.xy,cache,newRayMaxT); + assert(!hlsl::any(hlsl::isinf(localL) || hlsl::isnan(localL))); + L = hlsl::mul(hlsl::transpose(ssph.basis),localL); } else - weight = bit_cast(numeric_limits::infinity); + { + L = ssph.generateUnnormalized(xi.xy,cache); + assert(!hlsl::any(hlsl::isinf(L) || hlsl::isnan(L))); + const scalar_type rcpLen = hlsl::rsqrt(hlsl::dot(L,L)); + newRayMaxT = 1.f / rcpLen; + L *= rcpLen; + } + // prevent self intersections against the emitter + newRayMaxT -= 0.0001f; - newRayMaxT = hlsl::dot(N, origin2origin) / hlsl::dot(N, L); + pdf = ssph.forwardPdf(xi.xy,cache); + weight = ssph.forwardWeight(xi.xy,cache); return L; } @@ -329,7 +337,6 @@ struct 
EffectivePolygonMethod NBL_CONSTEXPR_STATIC_INLINE NEEPolygonMethod value = PPM_SOLID_ANGLE; }; - // Projected solid angle NEE for rectangles using "Practical Warps": // bilinear warp over 4-corner NdotL + spherical rectangle sampling. // Same grazing-angle limitations as the triangle variant -- see comments @@ -359,21 +366,12 @@ struct ShapeSampling sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; sampling::ProjectedSphericalRectangle psr = sampling::ProjectedSphericalRectangle::create(sphR0, ray.origin, ray.normalAtOrigin, ray.wasBSDFAtOrigin); - // Reconstruct normalized [0,1]^2 position on the rectangle from the ray direction - const vector3_type N = rect.getNormalTimesArea(); - const scalar_type t = hlsl::dot(N, rect.offset - ray.origin) / hlsl::dot(N, ray.direction); - const vector3_type hitPoint = ray.origin + ray.direction * t; - const vector3_type localHit = hitPoint - rect.offset; - const vector p = vector(hlsl::dot(localHit, rectNormalBasis[0]) / rectExtents.x, hlsl::dot(localHit, rectNormalBasis[1]) / rectExtents.y); - const scalar_type pdf = psr.backwardPdf(p); - return pdf < numeric_limits::max ? 
pdf : numeric_limits::max; + return psr.backwardWeight(ray.direction); } template vector3_type generateAndPdfAndWeight(NBL_REF_ARG(scalar_type) pdf, NBL_REF_ARG(scalar_type) weight, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(Aniso) interaction, NBL_CONST_REF_ARG(vector3_type) xi) { - const vector3_type N = rect.getNormalTimesArea(); - const vector3_type origin2origin = rect.offset - origin; matrix rectNormalBasis; vector rectExtents; @@ -382,25 +380,37 @@ struct ShapeSampling sphR0.origin = rect.offset; sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; - vector3_type L = hlsl::promote(0.0); sampling::ProjectedSphericalRectangle psr = sampling::ProjectedSphericalRectangle::create(sphR0, origin, interaction.getN(), interaction.isMaterialBSDF()); - const scalar_type solidAngle = psr.sphrect.solidAngle; - if (solidAngle > numeric_limits::min) + typename sampling::ProjectedSphericalRectangle::cache_type cache; + + vector3_type L = hlsl::promote(0.0); + const bool FastVersion = true; + if (FastVersion) { - typename sampling::ProjectedSphericalRectangle::cache_type cache; - const vector3_type localDir = psr.generate(xi.xy, cache); - // not sure if generate() can produce NaN/inf when solidAngle > min - assert(!hlsl::any(hlsl::isinf(localDir) || hlsl::isnan(localDir))); - // transform local direction to world space - L = localDir.x * rectNormalBasis[0] + localDir.y * rectNormalBasis[1] + localDir.z * rectNormalBasis[2]; - pdf = psr.forwardPdf(xi.xy, cache); - weight = psr.forwardWeight(xi.xy, cache); + // actually the slowest + //L = psr.generate(xi.xy, cache); + //newRayMaxT = psr.sphrect.computeHitT(L); + + // fastest + const vector3_type localL = psr.generateNormalizedLocal(xi.xy,cache,newRayMaxT); + assert(!hlsl::any(hlsl::isinf(localL) || hlsl::isnan(localL))); + // hopefully CSE kicks in for the `UsePdfAsWeight==true` + L = hlsl::mul(hlsl::transpose(psr.sphrect.basis),localL); } else - weight = 
bit_cast(numeric_limits::infinity); - // TODO: `improved_spherical_rect` branch merge - newRayMaxT = hlsl::dot(N, origin2origin) / hlsl::dot(N, L); + { + L = psr.generateUnnormalized(xi.xy,cache); + assert(!hlsl::any(hlsl::isinf(L) || hlsl::isnan(L))); + const scalar_type rcpLen = hlsl::rsqrt(hlsl::dot(L,L)); + newRayMaxT = 1.f / rcpLen; + L *= rcpLen; + } + // prevent self intersections against the emitter + newRayMaxT -= 0.0001f; + + pdf = psr.forwardPdf(xi.xy,cache); + weight = psr.forwardWeight(xi.xy,cache); return L; } diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 4668580bd..749c2787e 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -439,7 +439,7 @@ class HLSLComputePathtracer final : public SimpleWindowedApplication, public Bui nullptr, nullptr ); - m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass(), 0u, {}, hlsl::SurfaceTransform::FLAG_BITS::IDENTITY_BIT, m_pipelineCache.object.get()); + m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass(), 0u, {}, {}, hlsl::SurfaceTransform::FLAG_BITS::IDENTITY_BIT, m_pipelineCache.object.get()); if (!m_presentPipeline) return logFail("Could not create Graphics Pipeline!"); m_pipelineCache.dirty = true; diff --git a/37_HLSLSamplingTests/CMakeLists.txt b/37_HLSLSamplingTests/CMakeLists.txt index 2ac238c33..78e3ab319 100644 --- a/37_HLSLSamplingTests/CMakeLists.txt +++ b/37_HLSLSamplingTests/CMakeLists.txt @@ -26,7 +26,7 @@ set(DEPENDS app_resources/shaders/projected_spherical_triangle_test.comp.hlsl app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl app_resources/shaders/spherical_rectangle_test.comp.hlsl - app_resources/shaders/alias_table_test.comp.hlsl + app_resources/shaders/packed_alias_test.comp.hlsl app_resources/shaders/cumulative_probability_test.comp.hlsl app_resources/common/linear.hlsl app_resources/common/uniform_hemisphere.hlsl @@ -42,6 
+42,7 @@ set(DEPENDS app_resources/common/concentric_mapping.hlsl app_resources/common/polar_mapping.hlsl app_resources/common/discrete_sampler_bench.hlsl + app_resources/common/sampler_bench_pc.hlsl app_resources/common/alias_table.hlsl app_resources/common/cumulative_probability.hlsl ) @@ -91,7 +92,7 @@ endif() set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") -set(BENCH_ITERS 2048) +set(BENCH_ITERS 128) set(WORKGROUP_SIZE 64) target_compile_definitions(${EXECUTABLE_NAME} PRIVATE @@ -99,7 +100,7 @@ target_compile_definitions(${EXECUTABLE_NAME} PRIVATE WORKGROUP_SIZE=${WORKGROUP_SIZE} ) -set(BENCH_OPTS "\"-DBENCH_ITERS=${BENCH_ITERS}\", \"-DWORKGROUP_SIZE=${WORKGROUP_SIZE}\"") +set(BENCH_OPTS "\"-DBENCH_ITERS=${BENCH_ITERS}\"") set(JSON " [ @@ -113,8 +114,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/linear_test.comp.hlsl\", - \"KEY\": \"linear_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"linear_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/linear_test.comp.hlsl\", + \"KEY\": \"linear_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/uniform_hemisphere_test.comp.hlsl\", @@ -122,8 +128,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/uniform_hemisphere_test.comp.hlsl\", - \"KEY\": \"uniform_hemisphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"uniform_hemisphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/uniform_hemisphere_test.comp.hlsl\", + \"KEY\": \"uniform_hemisphere_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/uniform_sphere_test.comp.hlsl\", @@ -131,8 +142,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/uniform_sphere_test.comp.hlsl\", - \"KEY\": 
\"uniform_sphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"uniform_sphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/uniform_sphere_test.comp.hlsl\", + \"KEY\": \"uniform_sphere_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/projected_hemisphere_test.comp.hlsl\", @@ -140,8 +156,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_hemisphere_test.comp.hlsl\", - \"KEY\": \"projected_hemisphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_hemisphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_hemisphere_test.comp.hlsl\", + \"KEY\": \"projected_hemisphere_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/projected_sphere_test.comp.hlsl\", @@ -149,8 +170,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_sphere_test.comp.hlsl\", - \"KEY\": \"projected_sphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_sphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_sphere_test.comp.hlsl\", + \"KEY\": \"projected_sphere_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", @@ -158,8 +184,18 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", - \"KEY\": \"spherical_triangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"spherical_triangle_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", 
+ \"KEY\": \"spherical_triangle_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", + \"KEY\": \"spherical_triangle_bench_create_only\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\"] }, { \"INPUT\": \"app_resources/shaders/concentric_mapping_test.comp.hlsl\", @@ -167,8 +203,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/concentric_mapping_test.comp.hlsl\", - \"KEY\": \"concentric_mapping_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"concentric_mapping_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/concentric_mapping_test.comp.hlsl\", + \"KEY\": \"concentric_mapping_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/polar_mapping_test.comp.hlsl\", @@ -176,8 +217,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/polar_mapping_test.comp.hlsl\", - \"KEY\": \"polar_mapping_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"polar_mapping_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/polar_mapping_test.comp.hlsl\", + \"KEY\": \"polar_mapping_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/bilinear_test.comp.hlsl\", @@ -185,8 +231,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/bilinear_test.comp.hlsl\", - \"KEY\": \"bilinear_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"bilinear_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/bilinear_test.comp.hlsl\", + \"KEY\": \"bilinear_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { 
\"INPUT\": \"app_resources/shaders/box_muller_transform_test.comp.hlsl\", @@ -194,8 +245,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/box_muller_transform_test.comp.hlsl\", - \"KEY\": \"box_muller_transform_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"box_muller_transform_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/box_muller_transform_test.comp.hlsl\", + \"KEY\": \"box_muller_transform_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", @@ -203,8 +259,18 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", - \"KEY\": \"projected_spherical_triangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_spherical_triangle_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", + \"KEY\": \"projected_spherical_triangle_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", + \"KEY\": \"projected_spherical_triangle_bench_create_only\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\"] }, { \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", @@ -212,8 +278,18 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", - \"KEY\": \"projected_spherical_rectangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_spherical_rectangle_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", + \"KEY\": 
\"projected_spherical_rectangle_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"projected_spherical_rectangle_bench_create_only\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\"] }, { \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", @@ -221,18 +297,68 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", - \"KEY\": \"spherical_rectangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"spherical_rectangle_bench_1_1_shape_observer\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_1_sa_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\", \"-DBENCH_VARIANT_SA_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_1_r0_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\", \"-DBENCH_VARIANT_R0_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_16_shape_observer\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_16_sa_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\", \"-DBENCH_VARIANT_SA_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_16_r0_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\", \"-DBENCH_VARIANT_R0_EXTENTS\"] + }, + { + \"INPUT\": 
\"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_create_only_shape_observer\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_create_only_sa_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\", \"-DBENCH_VARIANT_SA_EXTENTS\"] }, { - \"INPUT\": \"app_resources/shaders/alias_table_test.comp.hlsl\", - \"KEY\": \"alias_table_test\" + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_create_only_r0_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\", \"-DBENCH_VARIANT_R0_EXTENTS\"] }, { - \"INPUT\": \"app_resources/shaders/alias_table_test.comp.hlsl\", - \"KEY\": \"alias_table_bench\", + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_a_test\" + }, + { + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_b_test\", + \"COMPILE_OPTIONS\": [\"-DNBL_PACKED_ALIAS_B\"] + }, + { + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_a_bench\", \"COMPILE_OPTIONS\": [${BENCH_OPTS}] }, + { + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_b_bench\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DNBL_PACKED_ALIAS_B\"] + }, { \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", \"KEY\": \"cumulative_probability_test\" @@ -241,6 +367,16 @@ set(JSON " \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", \"KEY\": \"cumulative_probability_bench\", \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + }, + { + \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", + \"KEY\": \"cumulative_probability_yolo_bench\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DNBL_CUMPROB_YOLO_READS\"] + 
}, + { + \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", + \"KEY\": \"cumulative_probability_eytzinger_bench\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DNBL_CUMPROB_EYTZINGER\"] } ] ") @@ -250,7 +386,7 @@ NBL_CREATE_NSC_COMPILE_RULES( LINK_TO ${EXECUTABLE_NAME} BINARY_DIR ${OUTPUT_DIRECTORY} MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT - COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR} -T cs_6_8 + COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR} -T cs_6_8 -DWORKGROUP_SIZE=${WORKGROUP_SIZE} OUTPUT_VAR KEYS INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp NAMESPACE nbl::this_example::builtin::build diff --git a/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl b/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl index da7048a1f..08706408f 100644 --- a/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl @@ -8,12 +8,28 @@ using namespace nbl::hlsl; NBL_CONSTEXPR uint32_t AliasTestTableSize = 4; +// Log2N = ceil_log2(N) minimises quantisation drift on the stayProb unorm +// (here 30 unorm bits, essentially lossless). +NBL_CONSTEXPR uint32_t AliasTestLog2N = 2; -using AliasTestProbAccessor = ArrayAccessor; -using AliasTestAliasAccessor = ArrayAccessor; -using AliasTestPdfAccessor = ArrayAccessor; +using AliasTestPdfAccessor = ArrayAccessor; +using AliasTestPackedWordAccessor = ArrayAccessor; -using AliasTestSampler = sampling::AliasTable; +// Dedicated struct-valued accessor for PackedAliasEntryB. Field-wise copy +// sidesteps HLSL's struct functional-cast ambiguity. 
+struct AliasTestEntryBAccessor +{ + using value_type = sampling::PackedAliasEntryB; + + template + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + val.packedWord = data[i].packedWord; + val.ownPdf = data[i].ownPdf; + } + + value_type data[AliasTestTableSize]; +}; struct AliasTableInputValues { @@ -22,32 +38,64 @@ struct AliasTableInputValues struct AliasTableTestResults { - uint32_t generatedIndex; + uint32_t generatedIndex; float32_t forwardPdf; float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; // Pre-computed alias table for weights {1, 2, 3, 4}: -// pdf = {0.1, 0.2, 0.3, 0.4} -// prob = {0.4, 0.8, 1.0, 0.8} -// alias = {3, 3, 2, 2} -struct AliasTableTestExecutor +// pdf = {0.1, 0.2, 0.3, 0.4} +// stayProb = {0.4, 0.8, 1.0, 0.8} +// alias = {3, 3, 2, 2} +// +// Log2N = 2 unorm encoding (30 bits for stayProb, 2 bits for alias): +// packedWord = (alias & 0x3) | (round(stayProb * ((1u<<30) - 1)) << 2) +// bin 0: (3) | (429496729 << 2) = 0x66666667 +// bin 1: (3) | (858993458 << 2) = 0xCCCCCCCB +// bin 2: (2) | (1073741823 << 2) = 0xFFFFFFFE +// bin 3: (2) | (858993458 << 2) = 0xCCCCCCCA + +struct PackedAliasATestExecutor +{ + void operator()(NBL_CONST_REF_ARG(AliasTableInputValues) input, NBL_REF_ARG(AliasTableTestResults) output) + { + AliasTestPackedWordAccessor wordAcc; + wordAcc.data[0] = 0x66666667u; + wordAcc.data[1] = 0xCCCCCCCBu; + wordAcc.data[2] = 0xFFFFFFFEu; + wordAcc.data[3] = 0xCCCCCCCAu; + + AliasTestPdfAccessor pdfAcc; + pdfAcc.data[0] = 0.1f; + pdfAcc.data[1] = 0.2f; + pdfAcc.data[2] = 0.3f; + pdfAcc.data[3] = 0.4f; + + using Sampler = sampling::PackedAliasTableA; + Sampler sampler = Sampler::create(wordAcc, pdfAcc, AliasTestTableSize); + + Sampler::cache_type cache; + output.generatedIndex = sampler.generate(input.u, cache); + output.forwardPdf = sampler.forwardPdf(input.u, cache); + output.backwardPdf = sampler.backwardPdf(output.generatedIndex); + output.forwardWeight = 
sampler.forwardWeight(input.u, cache); + output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + } +}; + +struct PackedAliasBTestExecutor { void operator()(NBL_CONST_REF_ARG(AliasTableInputValues) input, NBL_REF_ARG(AliasTableTestResults) output) { - AliasTestProbAccessor probAcc; - probAcc.data[0] = 0.4f; - probAcc.data[1] = 0.8f; - probAcc.data[2] = 1.0f; - probAcc.data[3] = 0.8f; - - AliasTestAliasAccessor aliasAcc; - aliasAcc.data[0] = 3u; - aliasAcc.data[1] = 3u; - aliasAcc.data[2] = 2u; - aliasAcc.data[3] = 2u; + AliasTestEntryBAccessor entryAcc; + entryAcc.data[0].packedWord = 0x66666667u; entryAcc.data[0].ownPdf = 0.1f; + entryAcc.data[1].packedWord = 0xCCCCCCCBu; entryAcc.data[1].ownPdf = 0.2f; + entryAcc.data[2].packedWord = 0xFFFFFFFEu; entryAcc.data[2].ownPdf = 0.3f; + entryAcc.data[3].packedWord = 0xCCCCCCCAu; entryAcc.data[3].ownPdf = 0.4f; AliasTestPdfAccessor pdfAcc; pdfAcc.data[0] = 0.1f; @@ -55,14 +103,16 @@ struct AliasTableTestExecutor pdfAcc.data[2] = 0.3f; pdfAcc.data[3] = 0.4f; - AliasTestSampler sampler = AliasTestSampler::create(probAcc, aliasAcc, pdfAcc, AliasTestTableSize); + using Sampler = sampling::PackedAliasTableB; + Sampler sampler = Sampler::create(entryAcc, pdfAcc, AliasTestTableSize); - AliasTestSampler::cache_type cache; - output.generatedIndex = sampler.generate(input.u, cache); - output.forwardPdf = sampler.forwardPdf(input.u, cache); - output.backwardPdf = sampler.backwardPdf(output.generatedIndex); - output.forwardWeight = sampler.forwardWeight(input.u, cache); - output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + Sampler::cache_type cache; + output.generatedIndex = sampler.generate(input.u, cache); + output.forwardPdf = sampler.forwardPdf(input.u, cache); + output.backwardPdf = sampler.backwardPdf(output.generatedIndex); + output.forwardWeight = sampler.forwardWeight(input.u, cache); + 
output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; } }; diff --git a/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl b/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl index 1f0a68195..5e679c98a 100644 --- a/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl @@ -12,7 +12,6 @@ struct ArrayAccessor using value_type = T; template void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC { val = V(data[i]); } - T operator[](uint32_t i) NBL_CONST_MEMBER_FUNC { return data[i]; } T data[N]; }; diff --git a/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl b/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl index 64a13d3e1..752e547ce 100644 --- a/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -19,6 +20,7 @@ struct BilinearTestResults float32_t forwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; struct BilinearTestExecutor @@ -37,6 +39,10 @@ struct BilinearTestExecutor output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); } + // marginFactor = 3: same reasoning as Linear; Bilinear is two Linear stages, so the skewed- + // coefficient inverse-CDF d^2/du^2 divergence near [0,1]^2 boundary applies on both axes. 
+ output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 3.0f); + } }; diff --git a/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl b/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl index e8247e259..2b86e8560 100644 --- a/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -21,6 +22,7 @@ struct BoxMullerTransformTestResults float32_t forwardWeight; float32_t backwardWeight; float32_t2 separateBackwardPdf; + float32_t jacobianProduct; }; struct BoxMullerTransformTestExecutor @@ -40,6 +42,7 @@ struct BoxMullerTransformTestExecutor output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); output.separateBackwardPdf = sampler.separateBackwardPdf(output.generated); + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 10.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl b/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl index 67d8e5869..e0c6a570c 100644 --- a/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -20,6 +21,7 @@ struct ConcentricMappingTestResults float32_t forwardWeight; float32_t backwardWeight; float32_t jacobianProduct; + float32_t inverseJacobianPdf; float32_t2 roundtripError; }; @@ -39,7 +41,15 @@ struct ConcentricMappingTestExecutor output.backwardWeight = sampling::ConcentricMapping::backwardWeight(input.u); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = float32_t(1.0 / output.backwardPdf) * output.forwardPdf; + { + sampling::ConcentricMapping 
sampler; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); + // Disk-center singularity: concentric atan2 blows up as r->0. + const float32_t diskRadius = nbl::hlsl::length(output.mapped); + output.inverseJacobianPdf = diskRadius < 0.1f + ? JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.mapped, output.backwardPdf, 0.0f, 1e30f); + } } }; diff --git a/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl b/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl index f58a22741..e66cb44fe 100644 --- a/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl @@ -24,6 +24,7 @@ struct CumProbTestResults float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; // Pre-computed CDF table for weights {1, 2, 3, 4}: @@ -46,6 +47,7 @@ struct CumProbTestExecutor output.backwardPdf = sampler.backwardPdf(output.generatedIndex); output.forwardWeight = sampler.forwardWeight(input.u, cache); output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; } }; diff --git a/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl b/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl index 9f1fec422..198b72faf 100644 --- a/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl @@ -5,23 +5,22 @@ using namespace nbl::hlsl; -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif NBL_CONSTEXPR uint32_t WorkgroupSize = WORKGROUP_SIZE; -struct AliasTablePushConstants +struct CumProbPushConstants { - uint64_t probAddress; // float probability[N] - uint64_t aliasAddress; // uint32_t alias[N] - uint64_t pdfAddress; // float pdf[N] + uint64_t 
cumProbAddress; // float cumProb[N-1] uint64_t outputAddress; // uint32_t acc[threadCount] uint32_t tableSize; // N }; -struct CumProbPushConstants +// Variants A and B both take the entry array plus a separate pdf[] array +// (A: 4 B words, B: 8 B {packedWord, ownPdf}; pdf[] has the same contents in +// both but is tapped independently by the sampler). +struct PackedAliasABPushConstants { - uint64_t cumProbAddress; // float cumProb[N-1] + uint64_t entriesAddress; // A: uint32_t words[N] (4 B); B: PackedAliasEntryB[N] (8 B) + uint64_t pdfAddress; // float pdf[N] uint64_t outputAddress; // uint32_t acc[threadCount] uint32_t tableSize; // N }; diff --git a/37_HLSLSamplingTests/app_resources/common/jacobian_test.hlsl b/37_HLSLSamplingTests/app_resources/common/jacobian_test.hlsl new file mode 100644 index 000000000..f949f5b86 --- /dev/null +++ b/37_HLSLSamplingTests/app_resources/common/jacobian_test.hlsl @@ -0,0 +1,264 @@ +#ifndef _NBL_EXAMPLES_TESTS_37_SAMPLING_COMMON_JACOBIAN_TEST_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_37_SAMPLING_COMMON_JACOBIAN_TEST_INCLUDED_ + +#include +#include + +using namespace nbl::hlsl; + +// Negative sentinels signal "skipped" to the host verifier; the value encodes the reason. +static const float32_t JACOBIAN_SKIP_U_DOMAIN = -1.0f; +static const float32_t JACOBIAN_SKIP_CREASE = -2.0f; +static const float32_t JACOBIAN_SKIP_HEMI_BOUNDARY = -3.0f; +static const float32_t JACOBIAN_SKIP_BWD_PDF_RANGE = -4.0f; +static const float32_t JACOBIAN_SKIP_CODOMAIN_SINGULARITY = -5.0f; + + +template +struct ForwardJacobianMeasure; + +// Signed step that stays inside [0,1]: flip direction when u is in the upper half so u +/- eps +// never overshoots the domain. Magnitude is what matters (the stencil results take abs/length). +template +T signedEps(T u, T eps) +{ + return u > T(0.5) ? 
-eps : eps; +} + +template +struct ForwardJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + static scalar_type compute(Sampler _sampler, domain_type u, scalar_type eps, codomain_type L) + { + cache_type c; + const codomain_type L_x = _sampler.generate(u + signedEps(u, eps), c); + return nbl::hlsl::abs(L_x - L) / eps; + } +}; + +template +struct ForwardJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + static scalar_type compute(Sampler _sampler, domain_type u, scalar_type eps, codomain_type L) + { + domain_type u_x = u; + u_x[0] += signedEps(u[0], eps); + domain_type u_y = u; + u_y[1] += signedEps(u[1], eps); + cache_type c; + const codomain_type L_x = _sampler.generate(u_x, c); + const codomain_type L_y = _sampler.generate(u_y, c); + using matrix2_type = matrix; + const scalar_type det = nbl::hlsl::determinant(matrix2_type(L_x - L, L_y - L)); + return nbl::hlsl::abs(det) / (eps * eps); + } +}; + +template +struct ForwardJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + static scalar_type compute(Sampler _sampler, domain_type u, scalar_type eps, codomain_type L) + { + domain_type u_x = u; + u_x[0] += signedEps(u[0], eps); + domain_type u_y = u; + u_y[1] += signedEps(u[1], eps); + cache_type c; + const codomain_type L_x = _sampler.generate(u_x, c); + const codomain_type L_y = _sampler.generate(u_y, c); + return nbl::hlsl::length(nbl::hlsl::cross(L_x - L, L_y - L)) / (eps * eps); + } +}; + +// 3D domain: stencil 
perturbs u[0] and u[1] only, so the (2,3) body applies unchanged. +template +struct ForwardJacobianMeasure : ForwardJacobianMeasure +{ +}; + + +template +struct DomainMarginCheck; + +template +struct DomainMarginCheck +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + static bool outsideMargin(domain_type u, scalar_type margin) + { + return u < margin || u > scalar_type(1) - margin; + } +}; + +template +struct DomainMarginCheck +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + static bool outsideMargin(domain_type u, scalar_type margin) + { + return u[0] < margin || u[0] > scalar_type(1) - margin || u[1] < margin || u[1] > scalar_type(1) - margin; + } +}; + +// 3D domain: forward stencil only perturbs u[0] and u[1], so u[2] is irrelevant and (2) applies. +template +struct DomainMarginCheck : DomainMarginCheck +{ +}; + +enum JacobianMode : uint32_t +{ + JACOBIAN_PLAIN = 0, + JACOBIAN_CONCENTRIC = 1, // + concentric crease skip + JACOBIAN_CONCENTRIC_UXFOLD = 2 // + crease + u.x=0.5 hemi-boundary skip +}; + +// marginFactor scales the u-domain skip to marginFactor * eps. Use > 1 only for samplers whose +// stencil bias extends past a single eps-step (e.g. Arvo spherical triangle: sinZ ~ sqrt(u.y) +// gives O(h/u.y) forward-diff bias, so u.y in [0, k*eps] must be skipped). +template +float32_t computeJacobianProduct(Sampler _sampler, typename Sampler::domain_type u, float32_t eps, float32_t marginFactor) +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + NBL_IF_CONSTEXPR(Mode != JACOBIAN_PLAIN) + { + // Cast via float32_t2 so this block typechecks for scalar / vec2 / vec3 domains alike + // (HLSL splats scalars, identity on vec2, .xy on vec3). 1D samplers never reach here. 
+ const float32_t2 uxy = (float32_t2)u; + const float32_t ux = uxy.x; + const float32_t uy = uxy.y; + + NBL_IF_CONSTEXPR(Mode == JACOBIAN_CONCENTRIC_UXFOLD) + { + if (nbl::hlsl::abs(ux - float32_t(0.5)) <= float32_t(2e-3)) + return JACOBIAN_SKIP_HEMI_BOUNDARY; + } + + const bool uxFold = (Mode == JACOBIAN_CONCENTRIC_UXFOLD); + // Empirical: the concentric C0 crease's stencil bias spreads wider than the 2*eps geometric + // straddle band. Non-uxFold 6e-3 covers the disk-center residual for Projected samplers; + // uxFold 1e-2 accounts for the doubled local_ux rate when u.x is folded. + const float32_t creaseBand = uxFold ? float32_t(1e-2) : float32_t(6e-3); + const float32_t local_ux = uxFold ? nbl::hlsl::abs(float32_t(2) * ux - float32_t(1)) : ux; + const float32_t a = float32_t(2) * local_ux - float32_t(1); + const float32_t b = float32_t(2) * uy - float32_t(1); + if (nbl::hlsl::abs(nbl::hlsl::abs(a) - nbl::hlsl::abs(b)) <= creaseBand) + return JACOBIAN_SKIP_CREASE; + } + + using margin_check_type = DomainMarginCheck::Dimension>; + if (margin_check_type::outsideMargin(u, scalar_type(eps * marginFactor))) + return JACOBIAN_SKIP_U_DOMAIN; + + // Generate on a copy: some samplers mutate u through NBL_REF_ARG (e.g. ProjectedSphere + // consumes u.z for hemisphere selection), and the perturbations below need the original u. 
+ cache_type cache; + domain_type uGen = u; + const codomain_type L = _sampler.generate(uGen, cache); + const scalar_type pdf = _sampler.forwardPdf(uGen, cache); + + using measure_type = ForwardJacobianMeasure::Dimension, vector_traits::Dimension>; + const scalar_type measure = measure_type::compute(_sampler, u, scalar_type(eps), L); + + return pdf * measure; +} + + +template +struct InverseJacobianMeasure; + +template +struct InverseJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + + static scalar_type compute(Sampler _sampler, codomain_type x, scalar_type eps) + { + const scalar_type twoEps = scalar_type(2) * eps; + codomain_type x0_lo = x; + x0_lo[0] -= eps; + codomain_type x0_hi = x; + x0_hi[0] += eps; + codomain_type x1_lo = x; + x1_lo[1] -= eps; + codomain_type x1_hi = x; + x1_hi[1] += eps; + domain_type u0_lo = _sampler.generateInverse(x0_lo); + domain_type u0_hi = _sampler.generateInverse(x0_hi); + domain_type u1_lo = _sampler.generateInverse(x1_lo); + domain_type u1_hi = _sampler.generateInverse(x1_hi); + const domain_type dudx0 = (u0_hi - u0_lo) / twoEps; + const domain_type dudx1 = (u1_hi - u1_lo) / twoEps; + using matrix2_type = matrix; + const scalar_type det = nbl::hlsl::determinant(matrix2_type(dudx0, dudx1)); + return nbl::hlsl::abs(det); + } +}; + +template +struct InverseJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + + static scalar_type compute(Sampler _sampler, codomain_type x, scalar_type eps) + { + const scalar_type twoEps = scalar_type(2) * eps; + codomain_type t1, t2; + const codomain_type up = nbl::hlsl::abs(x[2]) < scalar_type(0.999) + ? 
codomain_type(scalar_type(0), scalar_type(0), scalar_type(1)) + : codomain_type(scalar_type(1), scalar_type(0), scalar_type(0)); + t1 = nbl::hlsl::normalize(nbl::hlsl::cross(up, x)); + t2 = nbl::hlsl::cross(x, t1); + domain_type u_t1_lo = _sampler.generateInverse(nbl::hlsl::normalize(x - t1 * eps)); + domain_type u_t1_hi = _sampler.generateInverse(nbl::hlsl::normalize(x + t1 * eps)); + domain_type u_t2_lo = _sampler.generateInverse(nbl::hlsl::normalize(x - t2 * eps)); + domain_type u_t2_hi = _sampler.generateInverse(nbl::hlsl::normalize(x + t2 * eps)); + const domain_type dudt1 = (u_t1_hi - u_t1_lo) / twoEps; + const domain_type dudt2 = (u_t2_hi - u_t2_lo) / twoEps; + using matrix2_type = matrix; + const scalar_type det = nbl::hlsl::determinant(matrix2_type(dudt1, dudt2)); + return nbl::hlsl::abs(det); + } +}; + +template +float32_t computeInverseJacobianPdf(Sampler _sampler, typename Sampler::codomain_type sample, float32_t backwardPdf, float32_t pdfMin, float32_t pdfMax) +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + + if (backwardPdf < scalar_type(pdfMin) || backwardPdf > scalar_type(pdfMax)) + return JACOBIAN_SKIP_BWD_PDF_RANGE; + + using measure_type = InverseJacobianMeasure::Dimension, vector_traits::Dimension>; + const scalar_type eps = scalar_type(1e-3); + return measure_type::compute(_sampler, sample, eps); +} + +#endif diff --git a/37_HLSLSamplingTests/app_resources/common/linear.hlsl b/37_HLSLSamplingTests/app_resources/common/linear.hlsl index b27d88e5b..af269ad2f 100644 --- a/37_HLSLSamplingTests/app_resources/common/linear.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/linear.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -19,6 +20,7 @@ struct LinearTestResults float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; struct 
LinearTestExecutor @@ -37,6 +39,7 @@ struct LinearTestExecutor output.backwardPdf = _sampler.backwardPdf(output.generated); output.backwardWeight = _sampler.backwardWeight(output.generated); } + output.jacobianProduct = computeJacobianProduct(_sampler, input.u, 1e-3f, 3.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl b/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl index 82e020fdc..e4b8ffabb 100644 --- a/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -20,6 +21,7 @@ struct PolarMappingTestResults float32_t forwardWeight; float32_t backwardWeight; float32_t jacobianProduct; + float32_t inverseJacobianPdf; float32_t2 roundtripError; }; @@ -39,7 +41,23 @@ struct PolarMappingTestExecutor output.backwardWeight = sampling::PolarMapping::backwardWeight(input.u); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = float32_t(1.0 / output.backwardPdf) * output.forwardPdf; + + { + sampling::PolarMapping sampler; + // marginFactor = 3: r = sqrt(u.x) gives O(h/u.x) forward-diff bias near u.x=0, so skip + // u.x within 3*eps of the domain boundary (same reasoning as Linear's skewed-density case). + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 3.0f); + // Two inverse singularities: + // - disk center: atan2 diverges as r -> 0 + // - atan2 branch cut at y=0, x>0: the stencil's +/-eps in y straddles the 2*pi wrap, + // producing du.y/eps ~ 1/eps spikes (seen as test values ~305-862 with eps=1e-3). + const float32_t polarRadius = nbl::hlsl::length(output.mapped); + const bool onCutBand = nbl::hlsl::abs(output.mapped.y) < 5e-3f && output.mapped.x > 0.0f; + output.inverseJacobianPdf = (polarRadius < 0.1f || onCutBand) + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.mapped, output.backwardPdf, 0.0f, 1e30f); + } + } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl index 9697cf0df..c48697b03 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -22,6 +23,7 @@ struct ProjectedHemisphereTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; }; struct ProjectedHemisphereTestExecutor @@ -43,7 +45,11 @@ struct ProjectedHemisphereTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 5.0f); + const float32_t phDiskR = nbl::hlsl::length((float32_t2)output.generated); + output.inverseJacobianPdf = phDiskR < 0.1f + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 1e-3f, 1e30f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl index e9886b61d..a78a937f6 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -20,6 +21,7 @@ struct ProjectedSphereTestResults float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; struct ProjectedSphereTestExecutor @@ -38,6 +40,7 @@ struct ProjectedSphereTestExecutor } output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 5.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl index 8370952ca..4aed7d9c3 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl @@ -4,6 +4,7 @@ #include #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -24,12 +25,10 @@ struct ProjectedSphericalRectangleTestResults float32_t2 surfaceOffset; float32_t3 referenceDirection; float32_t forwardPdf; - float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; - float32_t backwardPdfAtGenerated; - float32_t backwardWeightAtGenerated; float32_t2 extents; + float32_t jacobianProduct; }; struct ProjectedSphericalRectangleTestExecutor @@ -46,30 +45,29 @@ struct ProjectedSphericalRectangleTestExecutor output.extents = rect.extents; 
sampling::ProjectedSphericalRectangle::cache_type cache; + output.generated = sampler.generate(input.u, cache); + output.forwardPdf = sampler.forwardPdf(input.u, cache); + output.forwardWeight = sampler.forwardWeight(input.u, cache); + // backwardWeight now takes a 3D direction; evaluate at generated L. + output.backwardWeight = sampler.backwardWeight(output.generated); + + float32_t2 absXY; { - output.generated = sampler.generate(input.u, cache); - output.forwardPdf = sampler.forwardPdf(input.u, cache); - output.forwardWeight = sampler.forwardWeight(input.u, cache); - } - { - sampling::ProjectedSphericalRectangle::cache_type offsetCache; - output.surfaceOffset = sampler.generateSurfaceOffset(input.u, offsetCache); + typename sampling::Bilinear::cache_type bc; + const float32_t2 warped = sampler.bilinearPatch.generate(input.u, bc); + typename sampling::SphericalRectangle::cache_type sphrectCache; + absXY = sampler.sphrect.generateLocalBasisXY(warped, sphrectCache); + output.surfaceOffset = absXY - float32_t2(sampler.sphrect.r0.x, sampler.sphrect.r0.y); } - // reference direction: reconstruct local 3D point from surfaceOffset and normalize { - const float32_t3 localPoint = sampler.sphrect.r0 + float32_t3(output.surfaceOffset.x, output.surfaceOffset.y, float32_t(0)); - output.referenceDirection = nbl::hlsl::normalize(localPoint); + const float32_t3 localPoint = float32_t3(absXY.x, absXY.y, sampler.sphrect.r0.z); + const float32_t3 localDir = nbl::hlsl::normalize(localPoint); + output.referenceDirection = sampler.sphrect.basis[0] * localDir[0] + + sampler.sphrect.basis[1] * localDir[1] + + sampler.sphrect.basis[2] * localDir[2]; } - // Test backwardPdf/Weight at the rect center: a deterministic interior point - // that avoids amplifying generate's FP errors through backward evaluation. 
- const float32_t2 center = float32_t2(0.5, 0.5); - output.backwardPdf = sampler.backwardPdf(center); - output.backwardWeight = sampler.backwardWeight(center); - // Use cache.warped (the [0,1]^2 input to the spherical rect warp) for consistency - // checks, NOT generated/extents (the nonlinear warp output). The bilinear in - // forwardPdf evaluates at cache.warped, so backwardPdf must too. - output.backwardPdfAtGenerated = sampler.backwardPdf(cache.warped); - output.backwardWeightAtGenerated = sampler.backwardWeight(cache.warped); + + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 10.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl index 5c81e53e0..0c424590b 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl @@ -4,6 +4,7 @@ #include #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -21,11 +22,10 @@ struct ProjectedSphericalTriangleTestResults { float32_t3 generated; float32_t forwardPdf; - float32_t backwardPdf; - float32_t backwardPdfAtGenerated; float32_t forwardWeight; float32_t backwardWeight; float32_t backwardWeightAtGenerated; + float32_t jacobianProduct; }; struct ProjectedSphericalTriangleTestExecutor @@ -43,15 +43,20 @@ struct ProjectedSphericalTriangleTestExecutor output.forwardPdf = sampler.forwardPdf(input.u, cache); output.forwardWeight = sampler.forwardWeight(input.u, cache); } - // Test backwardPdf/Weight at the triangle centroid: a deterministic interior point computed - // from only basic arithmetic + sqrt (IEEE 754 exact), so CPU and GPU agree bit-exactly. - // Using output.generated would amplify generate's transcendental FP errors through - // generateInverse's acos, producing CPU/GPU divergence. 
const float32_t3 center = nbl::hlsl::normalize(input.vertex0 + input.vertex1 + input.vertex2); - output.backwardPdf = sampler.backwardPdf(center); output.backwardWeight = sampler.backwardWeight(center); - output.backwardPdfAtGenerated = sampler.backwardPdf(output.generated); output.backwardWeightAtGenerated = sampler.backwardWeight(output.generated); + // Check the bilinear-warped (inner) u directly: for skinny triangles with a strongly biased + // receiver normal, outer u well inside [0,1] can still warp to inner u <~ 0.02 where Arvo's + // sqrt(sinZ) noise dominates. Pre-skip on the inner u instead of padding an outer marginFactor. + sampling::Bilinear::cache_type bc; + const float32_t2 innerU = sampler.bilinearPatch.generate(input.u, bc); + const float32_t innerMargin = 0.02f; + const bool innerNearEdge = innerU.x < innerMargin || innerU.x > (1.0f - innerMargin) + || innerU.y < innerMargin || innerU.y > (1.0f - innerMargin); + output.jacobianProduct = innerNearEdge + ? JACOBIAN_SKIP_U_DOMAIN + : computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/sampler_bench_pc.hlsl b/37_HLSLSamplingTests/app_resources/common/sampler_bench_pc.hlsl new file mode 100644 index 000000000..ab357e504 --- /dev/null +++ b/37_HLSLSamplingTests/app_resources/common/sampler_bench_pc.hlsl @@ -0,0 +1,15 @@ +#ifndef _NBL_EXAMPLES_TESTS_37_SAMPLING_COMMON_SAMPLER_BENCH_PC_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_37_SAMPLING_COMMON_SAMPLER_BENCH_PC_INCLUDED_ + +#include + +// Implicit-output benchmark push constants. Every sampler bench shader writes +// one uint32_t accumulator per thread to outputAddress[invID]; nothing reads it +// back -- the goal is to keep the optimiser from eliding the sampling work. +// Mirrors the BDA convention from discrete_sampler_bench.hlsl. 
+struct SamplerBenchPushConstants +{ + uint64_t outputAddress; +}; + +#endif diff --git a/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl b/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl index 9ae4df256..68159405a 100644 --- a/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl @@ -4,6 +4,7 @@ #include #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -21,11 +22,17 @@ struct SphericalRectangleTestResults float32_t3 generated; float32_t2 surfaceOffset; float32_t3 referenceDirection; + float32_t3 normalizedLocal; + float32_t hitDist; + float32_t3 unnormalized; + float32_t computedHitT; + float32_t3 normalizedLocalToWorld; float32_t forwardPdf; float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; float32_t2 extents; + float32_t jacobianProduct; }; struct SphericalRectangleTestExecutor @@ -47,17 +54,36 @@ struct SphericalRectangleTestExecutor output.forwardPdf = sampler.forwardPdf(input.u, cache); output.forwardWeight = sampler.forwardWeight(input.u, cache); } + float32_t2 absXY; { sampling::SphericalRectangle::cache_type cache; - output.surfaceOffset = sampler.generateSurfaceOffset(input.u, cache); + absXY = sampler.generateLocalBasisXY(input.u, cache); + output.surfaceOffset = absXY - float32_t2(sampler.r0.x, sampler.r0.y); } - // reference direction: reconstruct local 3D point from surfaceOffset and normalize { - const float32_t3 localPoint = sampler.r0 + float32_t3(output.surfaceOffset.x, output.surfaceOffset.y, float32_t(0)); - output.referenceDirection = nbl::hlsl::normalize(localPoint); + const float32_t3 localDir = nbl::hlsl::normalize(float32_t3(absXY.x, absXY.y, sampler.r0.z)); + output.referenceDirection = sampler.basis[0] * localDir[0] + + sampler.basis[1] * localDir[1] + + sampler.basis[2] * localDir[2]; } + { + sampling::SphericalRectangle::cache_type cache; + 
output.normalizedLocal = sampler.generateNormalizedLocal(input.u, cache, output.hitDist); + output.normalizedLocalToWorld = sampler.basis[0] * output.normalizedLocal[0] + + sampler.basis[1] * output.normalizedLocal[1] + + sampler.basis[2] * output.normalizedLocal[2]; + } + { + sampling::SphericalRectangle::cache_type cache; + output.unnormalized = sampler.generateUnnormalized(input.u, cache); + } + output.computedHitT = sampler.computeHitT(output.generated); + output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); + // marginFactor = 3: __generate's sin_au denominator goes through catastrophic cancellation + // for u.x within ~2*eps of 0 or 1 (au near n*pi), leaving ~0.5% residual at factor 3. + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 3.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl b/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl index 291661629..d3cd09326 100644 --- a/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -24,6 +25,7 @@ struct SphericalTriangleTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; // Minimum signed distance to a triangle edge (sin of angular distance to nearest great circle). // Positive = inside, negative = outside. Allows tolerance at boundaries. 
float32_t generatedInside; @@ -39,7 +41,7 @@ struct SphericalTriangleTestExecutor const float32_t3 verts[3] = { input.vertex0, input.vertex1, input.vertex2 }; shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); - sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); // Forward: u -> v { @@ -58,9 +60,7 @@ struct SphericalTriangleTestExecutor } // Roundtrip error: ||u - u'|| output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - - // Jacobian product: (1/forwardPdf) * backwardPdf should equal 1 for bijective samplers - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 20.0f); // Domain preservation: // A point is inside the spherical triangle iff it is on the "inside" half-plane @@ -79,6 +79,13 @@ struct SphericalTriangleTestExecutor float32_t2 u = output.inverted; output.invertedInDomain = nbl::hlsl::min(nbl::hlsl::min(u.x, float32_t(1.0) - u.x), nbl::hlsl::min(u.y, float32_t(1.0) - u.y)); + + const float32_t uMargin = 1e-2f; + const bool nearUBoundary = output.inverted.x < uMargin || output.inverted.x > (1.0f - uMargin) + || output.inverted.y < uMargin || output.inverted.y > (1.0f - uMargin); + output.inverseJacobianPdf = nearUBoundary + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 0.1f, 10.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl b/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl index 76a724774..8541bef19 100644 --- a/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -14,7 +15,6 @@ struct UniformHemisphereInputValues struct UniformHemisphereTestResults { float32_t3 generated; - float32_t pdf; float32_t2 inverted; float32_t forwardPdf; float32_t backwardPdf; @@ -22,6 +22,7 @@ struct UniformHemisphereTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; }; struct UniformHemisphereTestExecutor @@ -42,7 +43,11 @@ struct UniformHemisphereTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); + const float32_t uhDiskR = nbl::hlsl::length((float32_t2)output.generated); + output.inverseJacobianPdf = uhDiskR < 0.1f + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 0.0f, 1e30f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl b/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl index 3780b82ef..fb4086e44 100644 --- a/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -14,7 +15,6 @@ struct UniformSphereInputValues struct UniformSphereTestResults { float32_t3 generated; - float32_t pdf; float32_t2 inverted; float32_t forwardPdf; float32_t backwardPdf; @@ -22,6 +22,7 @@ struct UniformSphereTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; }; struct UniformSphereTestExecutor @@ -43,7 +44,12 @@ struct UniformSphereTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); + const float32_t usDiskR = nbl::hlsl::length((float32_t2)output.generated); + const float32_t absZ = nbl::hlsl::abs(output.generated.z); + output.inverseJacobianPdf = (absZ < 0.1f || usDiskR < 0.1f) + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 0.0f, 1e30f); } }; diff --git a/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl deleted file mode 100644 index 72c4f1977..000000000 --- a/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl +++ /dev/null @@ -1,77 +0,0 @@ -#pragma shader_stage(compute) - -#include - -#ifdef BENCH_ITERS -#include "../common/discrete_sampler_bench.hlsl" -#include - -[[vk::push_constant]] AliasTablePushConstants pc; - -struct BdaProbabilityAccessor -{ - template && is_integral_v) - void get(I i, NBL_REF_ARG(V) val) { val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i)); } - uint64_t addr; -}; - -struct BdaAliasIndexAccessor -{ - template && is_integral_v) - void get(I i, NBL_REF_ARG(V) val) { val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i)); } - uint64_t addr; -}; - -struct BdaPdfAccessor -{ - template && is_integral_v) - void get(I i, NBL_REF_ARG(V) val) { val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i)); } - uint64_t addr; -}; - -using BenchAliasTable = sampling::AliasTable; -#else -#include "../common/alias_table.hlsl" - -[[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; -[[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; -#endif - -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif -[numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] -void main() -{ - const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; - -#ifdef BENCH_ITERS - BdaProbabilityAccessor probAcc; - probAcc.addr = pc.probAddress; - BdaAliasIndexAccessor aliasAcc; - aliasAcc.addr = pc.aliasAddress; - BdaPdfAccessor pdfAcc; - pdfAcc.addr = pc.pdfAddress; - BenchAliasTable sampler = BenchAliasTable::create(probAcc, aliasAcc, pdfAcc, pc.tableSize); - - float32_t xi = float32_t(nbl::hlsl::glsl::bitfieldReverse(invID)) / 
float32_t(~0u); - NBL_CONSTEXPR float32_t goldenRatio = 0.6180339887498949f; - uint32_t acc = 0u; - uint32_t accPdf = 0u; - - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) - { - float32_t u = frac(xi + float32_t(i) * goldenRatio); - BenchAliasTable::cache_type cache; - uint32_t generated = sampler.generate(u, cache); - acc ^= generated; - accPdf ^= asuint(sampler.forwardPdf(u, cache)); - } - - vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc + accPdf); -#else - AliasTableTestExecutor executor; - executor(inputTestValues[invID], outputTestValues[invID]); -#endif -} diff --git a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl index 06aad4fdc..420cbcd0b 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl @@ -5,37 +5,42 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif + [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb coefficients by invID so the sampler is non-uniform across threads. 
- const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t4 coeffs = float32_t4(0.25f, 0.5f, 0.75f, 1.0f) + perturbation; - sampling::Bilinear sampler = sampling::Bilinear::create(coeffs); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::Bilinear::cache_type cache; - float32_t2 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t4 coeffs = float32_t4(0.25f, 0.5f, 0.75f, 1.0f) + perturbation; + sampling::Bilinear sampler = sampling::Bilinear::create(coeffs); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::Bilinear::cache_type cache; + float32_t2 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else BilinearTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl index cf0f4065a..3302db2e9 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl @@ 
-5,37 +5,42 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif + [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb stddev by invID so the sampler is non-uniform across threads. - const float32_t perturbation = float32_t(invID) * 1.0e-7f; - sampling::BoxMullerTransform sampler = sampling::BoxMullerTransform::create(1.0f + perturbation); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - u.x = max(u.x, 1e-7f); - sampling::BoxMullerTransform::cache_type cache; - float32_t2 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + sampling::BoxMullerTransform sampler = sampling::BoxMullerTransform::create(1.0f + perturbation); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + u.x = max(u.x, 1e-7f); + sampling::BoxMullerTransform::cache_type cache; + float32_t2 generated = 
sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else BoxMullerTransformTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl index 973aba4fe..058c3ef11 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl @@ -5,17 +5,18 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif + [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; @@ -23,15 +24,19 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::ConcentricMapping::cache_type cache; - float32_t2 generated = sampling::ConcentricMapping::generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= 
asuint(sampling::ConcentricMapping::forwardPdf(generated, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ConcentricMapping::cache_type cache; + float32_t2 generated = sampling::ConcentricMapping::generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampling::ConcentricMapping::forwardPdf(generated, cache)); + } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else ConcentricMappingTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl index 2e48adc4a..f06613b49 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl @@ -12,13 +12,18 @@ struct BdaCumProbAccessor { using value_type = float32_t; template - void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC { val = V(vk::RawBufferLoad(addr + uint64_t(sizeof(value_type)) * uint64_t(i))); } - value_type operator[](uint32_t i) NBL_CONST_MEMBER_FUNC { value_type v; get(i, v); return v; } + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC { val = V(vk::RawBufferLoad(addr + uint64_t(sizeof(value_type)) * uint64_t(i), sizeof(value_type))); } uint64_t addr; }; -using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#if defined(NBL_CUMPROB_EYTZINGER) +using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#elif defined(NBL_CUMPROB_YOLO_READS) +using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#else +using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#endif #else #include "../common/cumulative_probability.hlsl" @@ -26,11 +31,7 @@ using 
BenchCumProbSampler = sampling::CumulativeProbabilitySampler outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; @@ -46,10 +47,10 @@ void main() for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t u = frac(xi + float32_t(i) * goldenRatio); + xi = frac(xi + goldenRatio); BenchCumProbSampler::cache_type cache; - uint32_t generated = sampler.generate(u, cache); - acc ^= generated ^ asuint(sampler.forwardPdf(u, cache)); + uint32_t generated = sampler.generate(xi, cache); + acc ^= generated ^ asuint(sampler.forwardPdf(xi, cache)); } vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc); diff --git a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl index 614f339b4..acf0887e5 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl @@ -5,37 +5,42 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif + [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb coefficients by invID so the sampler is non-uniform across threads. 
- const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t2 coeffs = float32_t2(0.2f, 0.8f) + perturbation; - sampling::Linear sampler = sampling::Linear::create(coeffs); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t u = float32_t(rng()) * toFloat; - sampling::Linear::cache_type cache; - float32_t generated = sampler.generate(u, cache); - acc ^= asuint(generated); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t2 coeffs = float32_t2(0.2f, 0.8f) + perturbation; + sampling::Linear sampler = sampling::Linear::create(coeffs); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t u = float32_t(rng()) * toFloat; + sampling::Linear::cache_type cache; + float32_t generated = sampler.generate(u, cache); + acc ^= asuint(generated); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else LinearTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/packed_alias_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/packed_alias_test.comp.hlsl new file mode 100644 index 000000000..b0dbeedac --- /dev/null +++ b/37_HLSLSamplingTests/app_resources/shaders/packed_alias_test.comp.hlsl @@ -0,0 +1,114 @@ +#pragma shader_stage(compute) + +#include + +#ifdef BENCH_ITERS +#include "../common/discrete_sampler_bench.hlsl" +#include + +[[vk::push_constant]] PackedAliasABPushConstants 
pc; + +// Log2N bucket. Covers all sweep sizes up to 2^LOG2N buckets without precision +// loss. The same value must be passed to the host-side packA() / +// packB() call so the bit layouts match. +NBL_CONSTEXPR uint32_t LOG2N_BUCKET = 26; + +// Variant A accessor: 4 B packed words. +struct BdaPackedWordAccessor +{ + using value_type = uint32_t; + + template && is_integral_v) + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i), sizeof(V)); + } + + uint64_t addr; +}; + +// Variant B accessor: 8 B PackedAliasEntryB. Loads a uint2 and decomposes it +// into the POD entry so DXC never sees a bitfield — avoids the Insert/Extract +// round-trip we observed when the sampler read from a bitfield struct. +struct BdaPackedAliasBAccessor +{ + using value_type = nbl::hlsl::sampling::PackedAliasEntryB; + + template) + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + const uint64_t loadAddr = addr + uint64_t(8u) * uint64_t(i); + const uint2 raw = vk::RawBufferLoad(loadAddr, 8u); + val.packedWord = raw.x; + val.ownPdf = asfloat(raw.y); + } + + uint64_t addr; +}; + +// Separate 4 B pdf[] accessor. 
+struct BdaPdfAccessor +{ + using value_type = float32_t; + + template && is_integral_v) + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i), sizeof(V)); + } + + uint64_t addr; +}; + +#ifdef NBL_PACKED_ALIAS_B +using BenchPackedAlias = nbl::hlsl::sampling::PackedAliasTableB; +#else +using BenchPackedAlias = nbl::hlsl::sampling::PackedAliasTableA; +#endif + +#else +#include "../common/alias_table.hlsl" + +[[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; +[[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; +#endif + +[numthreads(WORKGROUP_SIZE, 1, 1)] +void main() +{ + const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; + +#ifdef BENCH_ITERS +#ifdef NBL_PACKED_ALIAS_B + BdaPackedAliasBAccessor entryAcc; +#else + BdaPackedWordAccessor entryAcc; +#endif + entryAcc.addr = pc.entriesAddress; + BdaPdfAccessor pdfAcc; + pdfAcc.addr = pc.pdfAddress; + BenchPackedAlias sampler = BenchPackedAlias::create(entryAcc, pdfAcc, pc.tableSize); + + float32_t xi = float32_t(nbl::hlsl::glsl::bitfieldReverse(invID)) / float32_t(~0u); + NBL_CONSTEXPR float32_t goldenRatio = 0.6180339887498949f; + uint32_t acc = 0u; + + [loop] + for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + { + xi = frac(xi + goldenRatio); + BenchPackedAlias::cache_type cache; + uint32_t generated = sampler.generate(xi, cache); + acc ^= generated ^ asuint(sampler.forwardPdf(xi, cache)); + } + + vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc); +#else +#ifdef NBL_PACKED_ALIAS_B + PackedAliasBTestExecutor executor; +#else + PackedAliasATestExecutor executor; +#endif + executor(inputTestValues[invID], outputTestValues[invID]); +#endif +} diff --git a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl index db7488acd..b12b276e3 100644 --- 
a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl @@ -5,17 +5,18 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif + [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; @@ -23,15 +24,19 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::PolarMapping::cache_type cache; - float32_t2 generated = sampling::PolarMapping::generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= asuint(sampling::PolarMapping::forwardPdf(generated, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::PolarMapping::cache_type cache; + float32_t2 generated = sampling::PolarMapping::generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampling::PolarMapping::forwardPdf(generated, cache)); + } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else PolarMappingTestExecutor executor; 
executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl index 871444955..9be02b9fd 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl @@ -5,17 +5,18 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif + [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; @@ -23,16 +24,20 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; sampling::ProjectedHemisphere sampler; - sampling::ProjectedHemisphere::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ProjectedHemisphere::cache_type cache; + float32_t3 generated = 
sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else ProjectedHemisphereTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl index 67a3fa662..7488dc2d5 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl @@ -5,17 +5,18 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif + [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; @@ -23,16 +24,20 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t3 u = float32_t3(rng(), rng(), rng()) * toFloat; sampling::ProjectedSphere sampler; - sampling::ProjectedSphere::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) 
^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t3 u = float32_t3(rng(), rng(), rng()) * toFloat; + sampling::ProjectedSphere::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else ProjectedSphereTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl index 903075804..dd7f62db4 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl @@ -5,42 +5,69 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 +// Number of generate() calls per create(). Default = BENCH_ITERS (persistent: 1 create total). +// Set to 1 for 1:1, 16 for 1:16 multisampling, etc. Must divide BENCH_ITERS. +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif + [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void -main() +void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS // Perturb rectangle origin by invID so the sampler is non-uniform across threads. 
- const float32_t perturbation = float32_t(invID) * 1.0e-7f; - shapes::CompressedSphericalRectangle compressed; - compressed.origin = float32_t3(perturbation, perturbation, -2.0f); - compressed.right = float32_t3(1.0f, 0.0f, 0.0f); - compressed.up = float32_t3(0.0f, 1.0f, 0.0f); - shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); - sampling::ProjectedSphericalRectangle sampler = sampling::ProjectedSphericalRectangle::create(rect, float32_t3(perturbation, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, perturbation + 0.5), false); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::ProjectedSphericalRectangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + // Depend on i so the compiler can't hoist create() out of the loop. + const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampling::ProjectedSphericalRectangle sampler = sampling::ProjectedSphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, perturbation + 0.5), false); + // Read a cheap function of sampler state so create() can't be elided. 
+ sampling::ProjectedSphericalRectangle::cache_type pdfCache; + sampler.generate(float32_t2(0.5f, 0.5f), pdfCache); + acc ^= asuint(sampler.forwardPdf(float32_t2(0.5f, 0.5f), pdfCache)); } - benchOutput.Store(invID * 4u, acc); +#else + // Unified create:generate loop — one create per BENCH_SAMPLES_PER_CREATE generates. + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampling::ProjectedSphericalRectangle sampler = sampling::ProjectedSphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, perturbation + 0.5), false); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ProjectedSphericalRectangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } + } +#endif + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else ProjectedSphericalRectangleTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl index 83e47b3e1..9ed69291a 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl @@ -5,39 +5,57 
@@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif + [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb vertices and normal by invID so the sampler is non-uniform across threads. - const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; - shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); - sampling::ProjectedSphericalTriangle sampler = sampling::ProjectedSphericalTriangle::create(shape, normalize(float32_t3(perturbation, perturbation, 1.0f)), false); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::ProjectedSphericalTriangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), 
normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::ProjectedSphericalTriangle sampler = sampling::ProjectedSphericalTriangle::create(shape, normalize(float32_t3(perturbation, perturbation, 1.0f)), false); + sampling::ProjectedSphericalTriangle::cache_type pdfCache; + sampler.generate(float32_t2(0.5f, 0.5f), pdfCache); + acc ^= asuint(sampler.forwardPdf(float32_t2(0.5f, 0.5f), pdfCache)); } - benchOutput.Store(invID * 4u, acc); +#else + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::ProjectedSphericalTriangle sampler = sampling::ProjectedSphericalTriangle::create(shape, normalize(float32_t3(perturbation, perturbation, 1.0f)), false); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ProjectedSphericalTriangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } + } +#endif + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else ProjectedSphericalTriangleTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl index 
3e9a6fcae..8cba7fbcb 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl @@ -5,42 +5,115 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 +// Number of generate() calls per create(). Default = BENCH_ITERS (persistent: 1 create total). +// Set to 1 for 1:1 (create+generate per iter), 16 for 1:16 multisampling, etc. Must divide BENCH_ITERS. +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif + [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void -main() +void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb rectangle origin by invID so the sampler is non-uniform across threads. - const float32_t perturbation = float32_t(invID) * 1.0e-7f; - shapes::CompressedSphericalRectangle compressed; - compressed.origin = float32_t3(perturbation, perturbation, -2.0f); - compressed.right = float32_t3(1.0f, 0.0f, 0.0f); - compressed.up = float32_t3(0.0f, 1.0f, 0.0f); - shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); - sampling::SphericalRectangle sampler = sampling::SphericalRectangle::create(rect, float32_t3(perturbation, 0.0f, 0.0f)); + // Observer at origin so origin - observer = (p, p, -2) has no zero components: + // keeps all 4 denorm_n_z components perturbation-dependent (no constant-folding). 
+ const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; + +#if (defined(BENCH_VARIANT_SA_EXTENTS) || defined(BENCH_VARIANT_R0_EXTENTS)) && !defined(BENCH_CREATE_ONLY) + // variants 2/3 pre-build: produce a rect (for its basis, sa, extents) once per thread. + shapes::CompressedSphericalRectangle compressedBase; + compressedBase.origin = float32_t3(perturbationBase, perturbationBase, -2.0f); + compressedBase.right = float32_t3(1.0f, 0.0f, 0.0f); + compressedBase.up = float32_t3(0.0f, 1.0f, 0.0f); + const shapes::SphericalRectangle rectBase = shapes::SphericalRectangle::create(compressedBase); + const typename shapes::SphericalRectangle::solid_angle_type saBase = rectBase.solidAngle(float32_t3(0.0f, 0.0f, 0.0f)); + const float32_t2 extentsBase = rectBase.extents; + const matrix basisBase = rectBase.basis; +#endif nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::SphericalRectangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + // Depend on i so the compiler can't hoist create() out of the loop. 
+ const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + sampling::SphericalRectangle sampler; + #if defined(BENCH_VARIANT_SA_EXTENTS) + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + typename shapes::SphericalRectangle::solid_angle_type sa = rect.solidAngle(float32_t3(0.0f, 0.0f, 0.0f)); + sampler = sampling::SphericalRectangle::create(rect.basis, sa, rect.extents); + #elif defined(BENCH_VARIANT_R0_EXTENTS) + // Build a basis from the same rect geometry so create(basis, r0, extents) has the right frame. + shapes::CompressedSphericalRectangle compressedR0; + compressedR0.origin = float32_t3(perturbation, perturbation, -2.0f); + compressedR0.right = float32_t3(1.0f, 0.0f, 0.0f); + compressedR0.up = float32_t3(0.0f, 1.0f, 0.0f); + const shapes::SphericalRectangle rectR0 = shapes::SphericalRectangle::create(compressedR0); + const float32_t3 r0 = float32_t3(perturbation, perturbation, -2.0f); + const float32_t2 extents = float32_t2(1.0f, 1.0f); + sampler = sampling::SphericalRectangle::create(rectR0.basis, r0, extents); + #else + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampler = sampling::SphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f)); + #endif + // Read a cheap function of sampler state so create() can't be elided. + acc ^= asuint(sampler.backwardPdf(float32_t3(0.0f, 0.0f, 1.0f))); } - benchOutput.Store(invID * 4u, acc); +#else + // Unified create:generate loop - one create per BENCH_SAMPLES_PER_CREATE generates. 
+ const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + sampling::SphericalRectangle sampler; + #if defined(BENCH_VARIANT_SA_EXTENTS) + // variant 2: create(basis, sa, extents). Poison one cosGamma so the sincos_accumulator can't be hoisted. + typename shapes::SphericalRectangle::solid_angle_type sa = saBase; + sa.cosGamma[2] += perturbation; + sampler = sampling::SphericalRectangle::create(basisBase, sa, extentsBase); + #elif defined(BENCH_VARIANT_R0_EXTENTS) + // variant 3: create(basis, r0, extents). r0 matches what variant 1 produces. + const float32_t3 r0 = float32_t3(perturbation, perturbation, -2.0f); + const float32_t2 extents = float32_t2(1.0f, 1.0f); + sampler = sampling::SphericalRectangle::create(basisBase, r0, extents); + #else + // variant 1 (default): create(shape, observer). + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampler = sampling::SphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f)); + #endif + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::SphericalRectangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } + } +#endif + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else SphericalRectangleTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl 
b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl index 55991bcb3..14b4843b9 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl @@ -5,39 +5,56 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif + + [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb vertices by invID so the sampler is non-uniform across threads. 
- const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; - shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); - sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::SphericalTriangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + acc ^= asuint(sampler.backwardPdf(float32_t3(0.0f, 0.0f, 1.0f))); + } +#else + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = 
shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::SphericalTriangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } - benchOutput.Store(invID * 4u, acc); +#endif + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else SphericalTriangleTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl index 908520243..3c832e995 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl @@ -1,4 +1,8 @@ +#pragma shader_stage(compute) + // Compile test: instantiate all sampling types and their concept-required methods to verify DXC compilation +#include +#include #include #include #include @@ -9,12 +13,15 @@ #include #include #include +#include +#include +#include +#include "../common/array_accessor.hlsl" using namespace nbl::hlsl; [[vk::binding(0, 0)]] RWStructuredBuffer output; [numthreads(1, 1, 1)] -[shader("compute")] void main() { float32_t2 u2 = float32_t2(0.5, 0.5); @@ -119,7 +126,7 @@ void main() // Octant triangle: all dot products between vertices are 0, so cos_sides=0, csc_sides=1 const float32_t3 triVerts[3] = {float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1)}; shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::createFromUnitSphereVertices(triVerts); - sampling::SphericalTriangle sphTri = sampling::SphericalTriangle::create(shapeTri); + sampling::SphericalTriangle sphTri = 
sampling::SphericalTriangle::create(shapeTri); sampling::SphericalTriangle::cache_type sphTriCache; float32_t3 stSample = sphTri.generate(u2, sphTriCache); acc.xyz += stSample; @@ -129,7 +136,7 @@ void main() acc.x += sphTri.backwardPdf(stSample); acc.x += sphTri.backwardWeight(stSample); - // SphericalRectangle — generate, forwardPdf, backwardPdf, forwardWeight, backwardWeight + // SphericalRectangle — generate, generateSurfaceOffset, forwardPdf, backwardPdf, forwardWeight, backwardWeight shapes::CompressedSphericalRectangle csr; csr.origin = float32_t3(0.0, 0.0, -1.0); csr.right = float32_t3(1.0, 0.0, 0.0); @@ -140,20 +147,71 @@ void main() sampling::SphericalRectangle::cache_type sphRectCache; float32_t3 srSample = sphRect.generate(u2, sphRectCache); acc.xyz += srSample; + acc.xy += sphRect.generateLocalBasisXY(u2, sphRectCache); acc.x += sphRect.forwardPdf(u2, sphRectCache); acc.x += sphRect.forwardWeight(u2, sphRectCache); acc.x += sphRect.backwardPdf(srSample); acc.x += sphRect.backwardWeight(srSample); - // ProjectedSphericalTriangle — generate, forwardPdf, backwardPdf, forwardWeight, backwardWeight + // ProjectedSphericalTriangle — generate, forwardPdf, forwardWeight, backwardWeight(L) sampling::ProjectedSphericalTriangle projTri = sampling::ProjectedSphericalTriangle::create(shapeTri, float32_t3(0.0, 0.0, 1.0), false); sampling::ProjectedSphericalTriangle::cache_type projTriCache; float32_t3 ptSample = projTri.generate(u2, projTriCache); acc.xyz += ptSample; acc.x += projTri.forwardPdf(u2, projTriCache); acc.x += projTri.forwardWeight(u2, projTriCache); - acc.x += projTri.backwardPdf(ptSample); acc.x += projTri.backwardWeight(ptSample); + // ProjectedSphericalRectangle (UsePdfAsWeight=true) — generate, forwardPdf, forwardWeight, backwardWeight(L) + const float32_t3 psrNormal = float32_t3(0.0, 0.0, 1.0); + sampling::ProjectedSphericalRectangle projRectPdf = + sampling::ProjectedSphericalRectangle::create(shapeRect, srObserver, psrNormal, false); + 
sampling::ProjectedSphericalRectangle::cache_type projRectPdfCache; + float32_t3 prPdfSample = projRectPdf.generate(u2, projRectPdfCache); + acc.xyz += prPdfSample; + acc.x += projRectPdf.forwardPdf(u2, projRectPdfCache); + acc.x += projRectPdf.forwardWeight(u2, projRectPdfCache); + acc.x += projRectPdf.backwardWeight(prPdfSample); + + // ProjectedSphericalRectangle (UsePdfAsWeight=false) — exercise the MIS-weight path + sampling::ProjectedSphericalRectangle projRectMis = + sampling::ProjectedSphericalRectangle::create(shapeRect, srObserver, psrNormal, true); + sampling::ProjectedSphericalRectangle::cache_type projRectMisCache; + float32_t3 prMisSample = projRectMis.generate(u2, projRectMisCache); + acc.xyz += prMisSample; + acc.x += projRectMis.forwardPdf(u2, projRectMisCache); + acc.x += projRectMis.forwardWeight(u2, projRectMisCache); + acc.x += projRectMis.backwardWeight(prMisSample); + + // AliasTable — generate (with/without cache), forwardPdf, backwardPdf, forwardWeight, backwardWeight + ArrayAccessor aliasProb; + aliasProb.data[0] = 0.25; aliasProb.data[1] = 0.5; aliasProb.data[2] = 0.75; aliasProb.data[3] = 1.0; + ArrayAccessor aliasIdx; + aliasIdx.data[0] = 1u; aliasIdx.data[1] = 2u; aliasIdx.data[2] = 3u; aliasIdx.data[3] = 0u; + ArrayAccessor aliasPdf; + aliasPdf.data[0] = 0.25; aliasPdf.data[1] = 0.25; aliasPdf.data[2] = 0.25; aliasPdf.data[3] = 0.25; + + // CumulativeProbabilitySampler — generate (with/without cache), forwardPdf, backwardPdf, forwardWeight, backwardWeight + ArrayAccessor cumProb; + cumProb.data[0] = 0.25; cumProb.data[1] = 0.5; cumProb.data[2] = 0.75; + sampling::CumulativeProbabilitySampler > cumSampler = + sampling::CumulativeProbabilitySampler >::create(cumProb, 4u); + sampling::CumulativeProbabilitySampler >::cache_type cumCache; + uint32_t cumBin0 = cumSampler.generate(0.6); + uint32_t cumBin = cumSampler.generate(0.6, cumCache); + acc.x += float32_t(cumBin0 + cumBin); + acc.x += cumSampler.forwardPdf(0.6, cumCache); + acc.x += 
cumSampler.forwardWeight(0.6, cumCache); + acc.x += cumSampler.backwardPdf(cumBin); + acc.x += cumSampler.backwardWeight(cumBin); + + // PartitionRandVariable — operator() partitions u into a left/right branch + sampling::PartitionRandVariable partition; + partition.leftProb = 0.25; + float32_t partXi = 0.5; + float32_t partRcp; + bool partRight = partition(partXi, partRcp); + acc.x += partXi + partRcp + float32_t(partRight ? 1 : 0); + output[0] = acc; } diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl index d0990ef43..50901e481 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl @@ -5,17 +5,18 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif + [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; @@ -23,16 +24,20 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; sampling::UniformHemisphere sampler; - sampling::UniformHemisphere::cache_type cache; - 
float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::UniformHemisphere::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else UniformHemisphereTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl index 0d33f5c11..0351e358f 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl @@ -5,17 +5,18 @@ #include #ifdef BENCH_ITERS -[[vk::binding(1, 0)]] RWByteAddressBuffer benchOutput; +#include "../common/sampler_bench_pc.hlsl" +[[vk::push_constant]] SamplerBenchPushConstants benchPC; #else [[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif + [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; @@ -23,16 +24,20 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t 
outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; sampling::UniformSphere sampler; - sampling::UniformSphere::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::UniformSphere::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } - benchOutput.Store(invID * 4u, acc); + vk::RawBufferStore(benchPC.outputAddress + invID * 4u, acc); #else UniformSphereTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h index 8f85545b3..f12ba9421 100644 --- a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h @@ -6,326 +6,247 @@ #include #include #include "app_resources/common/discrete_sampler_bench.hlsl" +#include "nbl/examples/Benchmark/IBenchmark.h" +#include "nbl/examples/Benchmark/GPUBenchmarkHelper.h" #include using namespace nbl; -// Benchmarks alias table vs cumulative probability sampler on the GPU using BDA. -// Builds both tables from the same weight distribution, uploads via BDA buffers, -// and measures GPU throughput using timestamp queries. 
-class CDiscreteSamplerBenchmark +class CDiscreteSamplerBenchmark : public GPUBenchmark { public: - struct SetupData + // Declared up-front because it's used as the index domain for m_pipelineIdx[] + // (a member-array bound needs the type complete in declaration order). + enum class SamplerKind : uint32_t { - core::smart_refctd_ptr device; - core::smart_refctd_ptr api; - core::smart_refctd_ptr assetMgr; - core::smart_refctd_ptr logger; - video::IPhysicalDevice* physicalDevice; - std::string aliasShaderKey; - std::string cumProbShaderKey; - uint32_t computeFamilyIndex; - uint32_t dispatchGroupCount; - uint32_t tableSize; + AliasPackedA = 0, + AliasPackedB, + CumProbCompare, + CumProbYolo, + CumProbEytzinger, + Count }; - void setup(const SetupData& data) + struct SetupData { - m_device = data.device; - m_logger = data.logger; - m_dispatchGroupCount = data.dispatchGroupCount; - m_tableSize = data.tableSize; - m_physicalDevice = data.physicalDevice; - - m_queue = m_device->getQueue(data.computeFamilyIndex, 0); - - // Command pool + buffers - m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchCmdbuf); - m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampBeforeCmdbuf); - m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampAfterCmdbuf); - - // Timestamp query pool - { - video::IQueryPool::SCreationParams qp = {}; - qp.queryType = video::IQueryPool::TYPE::TIMESTAMP; - qp.queryCount = 2; - qp.pipelineStatisticsFlags = video::IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; - m_queryPool = m_device->createQueryPool(qp); - } - - // Generate random weights - const uint32_t N = m_tableSize; - std::vector weights(N); - std::mt19937 rng(42); - std::uniform_real_distribution dist(0.001f, 100.0f); - for (uint32_t i = 0; i < N; 
i++) - weights[i] = dist(rng); - - // Build alias table - std::vector aliasProb(N); - std::vector aliasIdx(N); - std::vector aliasPdf(N); - std::vector workspace(N); - nbl::hlsl::sampling::AliasTableBuilder::build({weights}, aliasProb.data(), aliasIdx.data(), aliasPdf.data(), workspace.data()); - - // Build cumulative probability table - std::vector cumProb(N - 1); - nbl::hlsl::sampling::computeNormalizedCumulativeHistogram({weights}, cumProb.data()); - - // Create BDA buffers and upload data - auto createBdaBuffer = [&](const void* srcData, size_t bytes) -> core::smart_refctd_ptr - { - video::IGPUBuffer::SCreationParams bp = {}; - bp.size = bytes; - bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | - video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - auto buf = m_device->createBuffer(std::move(bp)); - - video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); - auto alloc = m_device->allocate(reqs, buf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + core::smart_refctd_ptr assetMgr; + // Each pipeline is independent; main.cpp can pick precompiled or runtime per + // pipeline by passing ShaderVariant::Precompiled(get_spirv_key<...>()) or + // ShaderVariant::FromSource(path, defines) respectively. + GPUBenchmarkHelper::ShaderVariant packedAliasAVariant; + GPUBenchmarkHelper::ShaderVariant packedAliasBVariant; + GPUBenchmarkHelper::ShaderVariant cumProbVariant; + GPUBenchmarkHelper::ShaderVariant cumProbYoloVariant; + GPUBenchmarkHelper::ShaderVariant cumProbEytzingerVariant; + hlsl::uint32_t3 dispatchGroupCount; + uint64_t targetBudgetMs = 400; // wall-clock budget per sweep row + // N values the sweep cycles through. Dispatch count per row is auto-sized + // by runTimedBudgeted to hit the budget. 
+ std::span sweepNs; + }; - const auto allocSize = alloc.memory->getAllocationSize(); - if (alloc.memory->map({0ull, allocSize}, video::IDeviceMemoryAllocation::EMCAF_WRITE)) - { - std::memcpy(alloc.memory->getMappedPointer(), srcData, bytes); - // Flush so GPU can see the written data - video::ILogicalDevice::MappedMemoryRange flushRange(alloc.memory.get(), 0ull, allocSize); - m_device->flushMappedMemoryRanges(1u, &flushRange); - alloc.memory->unmap(); - } - return buf; + // Shape is derivable from SetupData; expose it so the caller can use it + // both to configure the bench and to build the matching RunContext for the + // span that runs this bench + static WorkloadShape shapeFor(const SetupData& data) + { + const uint32_t totalThreads = data.dispatchGroupCount.x * data.dispatchGroupCount.y * data.dispatchGroupCount.z * WORKGROUP_SIZE; + const uint64_t samplesPerDispatch = uint64_t(totalThreads) * uint64_t(BENCH_ITERS); + return { + .workgroupSize = {WORKGROUP_SIZE, 1u, 1u}, + .dispatchGroupCount = data.dispatchGroupCount, + .samplesPerDispatch = samplesPerDispatch, }; + } - const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE; - - // Alias table buffers - m_aliasProbBuf = createBdaBuffer(aliasProb.data(), N * sizeof(float)); - m_aliasIdxBuf = createBdaBuffer(aliasIdx.data(), N * sizeof(uint32_t)); - m_aliasPdfBuf = createBdaBuffer(aliasPdf.data(), N * sizeof(float)); + CDiscreteSamplerBenchmark(Aggregator& aggregator, const SetupData& data) + : GPUBenchmark(aggregator, GPUBenchmark::SetupData{ + .name = {}, // per-row names synthesized at run time + .warmupDispatches = 0, + .shape = shapeFor(data), + .targetBudgetMs = data.targetBudgetMs, + }) + { + const uint32_t totalThreads = data.dispatchGroupCount.x * data.dispatchGroupCount.y * data.dispatchGroupCount.z * WORKGROUP_SIZE; - // CDF buffer - m_cumProbBuf = createBdaBuffer(cumProb.data(), (N - 1) * sizeof(float)); + m_assetMgr = data.assetMgr; + m_sweepNs = data.sweepNs; - // Shared output 
buffer + for (const uint32_t N : m_sweepNs) { - video::IGPUBuffer::SCreationParams bp = {}; - bp.size = totalThreads * sizeof(uint32_t); - bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | - video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - m_outputBuf = m_device->createBuffer(std::move(bp)); - video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = m_outputBuf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); - m_device->allocate(reqs, m_outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + const std::string nStr = std::format("N={}", N); + for (const auto& v : kSweepVariants) + registerVariant({nStr, v.family, v.leaf}); } - // Create pipelines (push constants only, no descriptor sets) - auto loadShader = [&](const std::string& key) - { - asset::IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; - auto bundle = data.assetMgr->getAsset(key, lp); - auto source = asset::IAsset::castDown(bundle.getContents()[0]); - return m_device->compileShader({.source = source.get()}); - }; - - // Alias table pipeline - { - const asset::SPushConstantRange pcRange = { - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .offset = 0, - .size = sizeof(AliasTablePushConstants)}; - auto layout = m_device->createPipelineLayout({&pcRange, 1}); - if (!layout) - m_logger->log("CDiscreteSamplerBenchmark: failed to create alias pipeline layout", system::ILogger::ELL_ERROR); - video::IGPUComputePipeline::SCreationParams pp = {}; - pp.layout = layout.get(); - auto shader = loadShader(data.aliasShaderKey); - if (!shader) - m_logger->log("CDiscreteSamplerBenchmark: failed to load alias shader", system::ILogger::ELL_ERROR); - pp.shader.shader = shader.get(); - pp.shader.entryPoint = "main"; - - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - pp.flags |= 
video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; - } - - if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &m_aliasPipeline)) - m_logger->log("CDiscreteSamplerBenchmark: failed to create alias compute pipeline", system::ILogger::ELL_ERROR); + // Shared output buffer (size only depends on thread count). GPU writes via BDA and + // nothing reads it on the CPU. + m_outputBuf = createBdaOutputBuffer(totalThreads * sizeof(uint32_t)).buf; + + // Pipelines (N-independent; only push constants change per run). Indices + // into m_pipelines (GPUBenchmarkHelper) are stored in the same order as SamplerKind + // so the sweep's variant table can index by enum directly. + m_pipelineIdx[static_cast(SamplerKind::AliasPackedA)] = createPipeline(data.packedAliasAVariant, m_assetMgr, sizeof(PackedAliasABPushConstants), "alias-packed-A"); + m_pipelineIdx[static_cast(SamplerKind::AliasPackedB)] = createPipeline(data.packedAliasBVariant, m_assetMgr, sizeof(PackedAliasABPushConstants), "alias-packed-B"); + m_pipelineIdx[static_cast(SamplerKind::CumProbCompare)] = createPipeline(data.cumProbVariant, m_assetMgr, sizeof(CumProbPushConstants), "cumprob-comparator"); + m_pipelineIdx[static_cast(SamplerKind::CumProbYolo)] = createPipeline(data.cumProbYoloVariant, m_assetMgr, sizeof(CumProbPushConstants), "cumprob-yolo"); + m_pipelineIdx[static_cast(SamplerKind::CumProbEytzinger)] = createPipeline(data.cumProbEytzingerVariant, m_assetMgr, sizeof(CumProbPushConstants), "cumprob-eytzinger"); + } - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - auto report = system::to_string(m_aliasPipeline->getExecutableInfo()); - m_logger->log("Alias Table Sampling Pipeline Executable Report:\n%s", system::ILogger::ELL_PERFORMANCE, report.c_str()); - } - m_aliasPplnLayout = std::move(layout); - } + // Rows are synthesized per (N, variant), not a single named entry, so + // each 
row checks cli.focusVariants individually. The aggregator's silent + // flag selects which half (focused / unfocused) we contribute to. + void run() override + { + const bool focusedPhase = isFocusPhase(); + // Warmup is small and fixed; budgeted measurement auto-sizes the + // measured-dispatch count to hit getTargetBudgetMs(). + constexpr uint32_t kWarmupDispatches = 64; - // CDF pipeline + for (const uint32_t N : m_sweepNs) { - const asset::SPushConstantRange pcRange = { - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .offset = 0, - .size = sizeof(CumProbPushConstants)}; - auto layout = m_device->createPipelineLayout({&pcRange, 1}); - if (!layout) - m_logger->log("CDiscreteSamplerBenchmark: failed to create cumprob pipeline layout", system::ILogger::ELL_ERROR); - video::IGPUComputePipeline::SCreationParams pp = {}; - pp.layout = layout.get(); - auto shader = loadShader(data.cumProbShaderKey); - if (!shader) - m_logger->log("CDiscreteSamplerBenchmark: failed to load cumprob shader", system::ILogger::ELL_ERROR); - pp.shader.shader = shader.get(); - pp.shader.entryPoint = "main"; - if (m_device->getEnabledFeatures().pipelineExecutableInfo) + const std::string nStr = std::format("N={}", N); + bool built = false; + for (const auto& [family, leaf, kind] : kSweepVariants) { - pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; + core::vector name = {nStr, family, leaf}; + const bool inFocus = isFocused(name); + const bool shouldRun = focusedPhase ? 
inFocus : !inFocus; + if (!shouldRun) + continue; + if (!built) + { + buildAndUpload(N); + built = true; + } + runSingle(N, std::move(name), kind, kWarmupDispatches); } - if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &m_cumProbPipeline)) - m_logger->log("CDiscreteSamplerBenchmark: failed to create cumprob compute pipeline", system::ILogger::ELL_ERROR); - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - auto report = system::to_string(m_cumProbPipeline->getExecutableInfo()); - m_logger->log("Cumulative Probability Sampling Pipeline Executable Report:\n%s", system::ILogger::ELL_PERFORMANCE, report.c_str()); - } - m_cumProbPplnLayout = std::move(layout); + if (built) + releaseTables(); } } - void run(uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) - { - constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; - const uint32_t totalThreads = m_dispatchGroupCount * benchWorkgroupSize; - m_logger->log("=== GPU Discrete Sampler Benchmark (N=%u, %u dispatches, %u threads/dispatch, %u iters/thread, ps/sample is per all GPU threads) ===", - system::ILogger::ELL_PERFORMANCE, m_tableSize, benchmarkIterations, totalThreads, BENCH_ITERS); - - runSingle("AliasTable", m_aliasPipeline, m_aliasPplnLayout, true, warmupIterations, benchmarkIterations); - runSingle("CumulativeProbability", m_cumProbPipeline, m_cumProbPplnLayout, false, warmupIterations, benchmarkIterations); - } - private: - void runSingle(const char* name, const core::smart_refctd_ptr& pipeline, const core::smart_refctd_ptr& layout, bool isAlias, uint32_t warmupIterations, uint32_t benchmarkIterations) + // (family, leaf, kind) for every variant the sweep runs. 
+ struct SweepVariant { - m_device->waitIdle(); - - // Record benchmark command buffer - m_benchCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_benchCmdbuf->begin(video::IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); - m_benchCmdbuf->bindComputePipeline(pipeline.get()); - - if (isAlias) - { - AliasTablePushConstants pc = {}; - pc.probAddress = m_aliasProbBuf->getDeviceAddress(); - pc.aliasAddress = m_aliasIdxBuf->getDeviceAddress(); - pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); - pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = m_tableSize; - m_benchCmdbuf->pushConstants(layout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); - } - else - { - CumProbPushConstants pc = {}; - pc.cumProbAddress = m_cumProbBuf->getDeviceAddress(); - pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = m_tableSize; - m_benchCmdbuf->pushConstants(layout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); - } - - m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - m_benchCmdbuf->end(); - - // Record timestamp command buffers - m_timestampBeforeCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampBeforeCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampBeforeCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); - m_timestampBeforeCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 0); - m_timestampBeforeCmdbuf->end(); - - m_timestampAfterCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampAfterCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampAfterCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 1); - m_timestampAfterCmdbuf->end(); - - auto semaphore = m_device->createSemaphore(0u); - uint64_t semCounter = 0u; - - const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_benchCmdbuf.get()}}; - const 
video::IQueue::SSubmitInfo::SCommandBufferInfo beforeCmds[] = {{.cmdbuf = m_timestampBeforeCmdbuf.get()}}; - const video::IQueue::SSubmitInfo::SCommandBufferInfo afterCmds[] = {{.cmdbuf = m_timestampAfterCmdbuf.get()}}; - - auto submitSerial = [&](const video::IQueue::SSubmitInfo::SCommandBufferInfo* cmds, uint32_t count) - { - const video::IQueue::SSubmitInfo::SSemaphoreInfo waitSem[] = { - {.semaphore = semaphore.get(), .value = semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { - {.semaphore = semaphore.get(), .value = ++semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - video::IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = {cmds, count}; - submit.waitSemaphores = waitSem; - submit.signalSemaphores = signalSem; - m_queue->submit({&submit, 1u}); - }; - - for (uint32_t i = 0u; i < warmupIterations; ++i) - submitSerial(benchCmds, 1u); + const char* family; // e.g. "AliasTable" + const char* leaf; // e.g. 
"packed A, 4 B" + SamplerKind kind; + }; + static constexpr SweepVariant kSweepVariants[] = { + {"AliasTable", "packed A, 4 B", SamplerKind::AliasPackedA}, + {"AliasTable", "packed B, 8 B", SamplerKind::AliasPackedB}, + {"CumulativeProbability", "comparator", SamplerKind::CumProbCompare}, + {"CumulativeProbability", "YOLO", SamplerKind::CumProbYolo}, + {"CumulativeProbability", "Eytzinger", SamplerKind::CumProbEytzinger}, + }; - submitSerial(beforeCmds, 1u); - for (uint32_t i = 0u; i < benchmarkIterations; ++i) - submitSerial(benchCmds, 1u); - submitSerial(afterCmds, 1u); + void buildAndUpload(const uint32_t N) + { + m_currentN = N; - m_device->waitIdle(); + std::vector weights(N); + std::mt19937 rng(42u + N); + std::uniform_real_distribution dist(0.001f, 100.0f); + for (uint32_t i = 0; i < N; i++) + weights[i] = dist(rng); - uint64_t timestamps[2] = {}; - const auto flags = core::bitflag(video::IQueryPool::RESULTS_FLAGS::_64_BIT) | - core::bitflag(video::IQueryPool::RESULTS_FLAGS::WAIT_BIT); - m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags); + // Build the alias table SoA (intermediate form), then pack it for variants A and B. + // Builder may pad PoT N to N+1 for cache-friendly stride; returned size drives + // every downstream buffer / push-constant value. 
+ std::vector aliasProb; + std::vector aliasIdx; + std::vector aliasPdf; + m_aliasTableN = sampling::AliasTableBuilder::build({weights}, aliasProb, aliasIdx, aliasPdf); + + constexpr uint32_t kPackedLog2N = 26u; + std::vector packedA(m_aliasTableN); + std::vector> packedB(m_aliasTableN); + sampling::AliasTableBuilder::packA({aliasProb}, {aliasIdx}, packedA.data()); + sampling::AliasTableBuilder::packB({aliasProb}, {aliasIdx}, {aliasPdf}, packedB.data()); + + // Cumulative probability (N-1 entries, last bucket implicitly 1.0) + std::vector cumProb(N - 1u); + sampling::computeNormalizedCumulativeHistogram({weights}, cumProb.data()); + + // Eytzinger level-order tree: 2*P entries where P = nextPot(N) + const uint32_t eytzingerP = sampling::eytzingerLeafCount(N); + const uint32_t eytzingerTreeSize = 2u * eytzingerP; + std::vector cumProbEytzinger(eytzingerTreeSize); + sampling::buildEytzinger({weights}, cumProbEytzinger.data()); + + m_aliasPdfBuf = createBdaBuffer(aliasPdf.data(), m_aliasTableN * sizeof(float)); + m_packedAliasABuf = createBdaBuffer(packedA.data(), m_aliasTableN * sizeof(uint32_t)); + m_packedAliasBBuf = createBdaBuffer(packedB.data(), m_aliasTableN * sizeof(sampling::PackedAliasEntryB)); + m_cumProbBuf = createBdaBuffer(cumProb.data(), (N - 1u) * sizeof(float)); + m_cumProbEytzingerBuf = createBdaBuffer(cumProbEytzinger.data(), eytzingerTreeSize * sizeof(float)); + } - constexpr uint32_t benchIters = BENCH_ITERS; - constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; - const float64_t timestampPeriod = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); - const float64_t elapsed_ns = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod; - const uint64_t totalThreads = uint64_t(m_dispatchGroupCount) * uint64_t(benchWorkgroupSize); - const uint64_t totalSamples = uint64_t(benchmarkIterations) * totalThreads * uint64_t(benchIters); - const float64_t ps_per_sample = elapsed_ns * 1e3 / float64_t(totalSamples); - const float64_t 
gsamples_per_s = float64_t(totalSamples) / elapsed_ns; - const float64_t elapsed_ms = elapsed_ns * 1e-6; + void releaseTables() + { + m_aliasPdfBuf = nullptr; + m_packedAliasABuf = nullptr; + m_packedAliasBBuf = nullptr; + m_cumProbBuf = nullptr; + m_cumProbEytzingerBuf = nullptr; + } - m_logger->log("[Benchmark] %-28s: %9.3f ps/sample | %10.3f GSamples/s | %10.3f ms total", system::ILogger::ELL_PERFORMANCE, name, ps_per_sample, gsamples_per_s, elapsed_ms); + void runSingle(uint32_t N, core::vector name, SamplerKind kind, uint32_t warmupIterations) + { + // Pipeline + push constants are bound *once* in bindOnce, the inner loop is just + // dispatch(...). Putting binds inside dispatchOne would inflate ps/sample on the + // tighter samplers. + const PipelineEntry* pe = getPipelineEntry(m_pipelineIdx[size_t(kind)], joinName(name)); + if (!pe) + return; + + const TimingResult timingResult = runTimedBudgeted(warmupIterations, getTargetBudgetMs(), + [&](IGPUCommandBuffer* cb) + { + if (kind == SamplerKind::AliasPackedA || kind == SamplerKind::AliasPackedB) + { + PackedAliasABPushConstants pc = {}; + pc.entriesAddress = (kind == SamplerKind::AliasPackedA ? m_packedAliasABuf : m_packedAliasBBuf)->getDeviceAddress(); + pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); + pc.outputAddress = m_outputBuf->getDeviceAddress(); + pc.tableSize = m_aliasTableN; + defaultBindAndPush(cb, *pe, pc); + } + else + { + CumProbPushConstants pc = {}; + const auto& buf = (kind == SamplerKind::CumProbEytzinger) ? 
m_cumProbEytzingerBuf : m_cumProbBuf; + pc.cumProbAddress = buf->getDeviceAddress(); + pc.outputAddress = m_outputBuf->getDeviceAddress(); + pc.tableSize = N; + defaultBindAndPush(cb, *pe, pc); + } + }, + [this](IGPUCommandBuffer* cb) { defaultDispatch(cb); }, + samplesForCurrentRow()); + + record(std::move(name), timingResult, pe->stats); } - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_logger; - core::smart_refctd_ptr m_cmdpool; - core::smart_refctd_ptr m_benchCmdbuf; - core::smart_refctd_ptr m_timestampBeforeCmdbuf; - core::smart_refctd_ptr m_timestampAfterCmdbuf; - core::smart_refctd_ptr m_queryPool; + core::smart_refctd_ptr m_assetMgr; - // Alias table - core::smart_refctd_ptr m_aliasPplnLayout; - core::smart_refctd_ptr m_aliasPipeline; - core::smart_refctd_ptr m_aliasProbBuf; - core::smart_refctd_ptr m_aliasIdxBuf; - core::smart_refctd_ptr m_aliasPdfBuf; + // Indices into m_pipelines (GPUBenchmarkHelper), indexed by SamplerKind. + uint32_t m_pipelineIdx[size_t(SamplerKind::Count)] = {}; - // Cumulative probability - core::smart_refctd_ptr m_cumProbPplnLayout; - core::smart_refctd_ptr m_cumProbPipeline; - core::smart_refctd_ptr m_cumProbBuf; + // Per-N data buffers (rebuilt each sweep step). pdf[] is shared between A and B. 
+ core::smart_refctd_ptr m_aliasPdfBuf; + core::smart_refctd_ptr m_packedAliasABuf; + core::smart_refctd_ptr m_packedAliasBBuf; + core::smart_refctd_ptr m_cumProbBuf; + core::smart_refctd_ptr m_cumProbEytzingerBuf; // Shared - core::smart_refctd_ptr m_outputBuf; - video::IQueue* m_queue = nullptr; - video::IPhysicalDevice* m_physicalDevice = nullptr; - uint32_t m_dispatchGroupCount = 0; - uint32_t m_tableSize = 0; + core::smart_refctd_ptr m_outputBuf; + uint32_t m_currentN = 0; + uint32_t m_aliasTableN = 0; + std::span m_sweepNs; }; #endif diff --git a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h index 3e2092670..7410b7242 100644 --- a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h @@ -7,259 +7,56 @@ #include #include "nbl/examples/examples.hpp" +#include "nbl/examples/Benchmark/IBenchmark.h" +#include "nbl/examples/Benchmark/GPUBenchmarkHelper.h" +#include "app_resources/common/sampler_bench_pc.hlsl" using namespace nbl; // Measures GPU execution time of a sampler shader using GPU timestamp queries. -class CSamplerBenchmark +// Output is implicit BDA addressed via SamplerBenchPushConstants. GPU plumbing +// (pipeline / buffer / timestamp queries) comes from GPUBenchmarkHelper; the +// bench-side glue here is PC layout + per-run dispatch + result recording. 
+class CSamplerBenchmark : public GPUBenchmark { -public: - struct SetupData - { - core::smart_refctd_ptr device; - core::smart_refctd_ptr api; - core::smart_refctd_ptr assetMgr; - core::smart_refctd_ptr logger; - video::IPhysicalDevice* physicalDevice; - uint32_t computeFamilyIndex; - std::string shaderKey; - uint32_t dispatchGroupCount; // workgroup count = testBatchCount - uint32_t samplesPerDispatch; // dispatchGroupCount * WorkgroupSize * benchIters - size_t inputBufferBytes; // sizeof(InputType) * samplesPerDispatch - size_t outputBufferBytes; // sizeof(ResultType) * samplesPerDispatch - }; - - void setup(const SetupData& data) - { - m_device = data.device; - m_logger = data.logger; - m_dispatchGroupCount = data.dispatchGroupCount; - - // Command pool + 3 command buffers: benchmark (multi-submit), before/after timestamp - m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchmarkCmdbuf)) - m_logger->log("CSamplerBenchmark: failed to create benchmark cmdbuf", system::ILogger::ELL_ERROR); - if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampBeforeCmdbuf)) - m_logger->log("CSamplerBenchmark: failed to create timestamp-before cmdbuf", system::ILogger::ELL_ERROR); - if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampAfterCmdbuf)) - m_logger->log("CSamplerBenchmark: failed to create timestamp-after cmdbuf", system::ILogger::ELL_ERROR); - - // Timestamp query pool (2 queries: before and after) - { - video::IQueryPool::SCreationParams qparams = {}; - qparams.queryType = video::IQueryPool::TYPE::TIMESTAMP; - qparams.queryCount = 2; - qparams.pipelineStatisticsFlags = video::IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; - m_queryPool = m_device->createQueryPool(qparams); - if (!m_queryPool) - 
m_logger->log("CSamplerBenchmark: failed to create query pool", system::ILogger::ELL_ERROR); - } - - // Load and compile shader - core::smart_refctd_ptr shader; - { - asset::IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; - auto bundle = data.assetMgr->getAsset(data.shaderKey, lp); - const auto assets = bundle.getContents(); - if (assets.empty()) - { - m_logger->log("CSamplerBenchmark: failed to load shader", system::ILogger::ELL_ERROR); - return; - } - auto source = asset::IAsset::castDown(assets[0]); - shader = m_device->compileShader({ source.get() }); - } - - // Descriptor set layout: binding 0 = input SSBO, binding 1 = output SSBO - video::IGPUDescriptorSetLayout::SBinding bindings[2] = { - { .binding = 0, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, - .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ShaderStage::ESS_COMPUTE, .count = 1 }, - { .binding = 1, .type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, - .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ShaderStage::ESS_COMPUTE, .count = 1 } - }; - auto dsLayout = m_device->createDescriptorSetLayout(bindings); - - m_pplnLayout = m_device->createPipelineLayout({}, core::smart_refctd_ptr(dsLayout)); - - { - video::IGPUComputePipeline::SCreationParams pparams = {}; - pparams.layout = m_pplnLayout.get(); - pparams.shader.entryPoint = "main"; - pparams.shader.shader = shader.get(); - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - pparams.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; - } - if (!m_device->createComputePipelines(nullptr, { &pparams, 1 }, &m_pipeline)) - m_logger->log("CSamplerBenchmark: failed to create compute pipeline", system::ILogger::ELL_ERROR); - - if 
(m_device->getEnabledFeatures().pipelineExecutableInfo) - m_executableReport = system::to_string(m_pipeline->getExecutableInfo()); - } - - // Allocate input buffer (host-visible, zero-filled, correctness irrelevant for benchmarking) - core::smart_refctd_ptr inputBuf; - { - video::IGPUBuffer::SCreationParams bparams = {}; - bparams.size = data.inputBufferBytes; - bparams.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - inputBuf = m_device->createBuffer(std::move(bparams)); - video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); - m_inputAlloc = m_device->allocate(reqs, inputBuf.get(), video::IDeviceMemoryAllocation::EMAF_NONE); - if (!m_inputAlloc.isValid()) - m_logger->log("CSamplerBenchmark: failed to allocate input buffer memory", system::ILogger::ELL_ERROR); - if (m_inputAlloc.memory->map({ 0ull, m_inputAlloc.memory->getAllocationSize() }, video::IDeviceMemoryAllocation::EMCAF_READ)) - { - std::memset(m_inputAlloc.memory->getMappedPointer(), 0, m_inputAlloc.memory->getAllocationSize()); - m_inputAlloc.memory->unmap(); - } - } - - // Allocate output buffer (host-visible, GPU writes garbage, never read back) - core::smart_refctd_ptr outputBuf; - { - video::IGPUBuffer::SCreationParams bparams = {}; - bparams.size = data.outputBufferBytes; - bparams.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - outputBuf = m_device->createBuffer(std::move(bparams)); - video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); - m_outputAlloc = m_device->allocate(reqs, outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_NONE); - if (!m_outputAlloc.isValid()) - m_logger->log("CSamplerBenchmark: failed to allocate output buffer memory", system::ILogger::ELL_ERROR); - } - - // Descriptor set: bind both buffers - auto pool = 
m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, { &dsLayout.get(), 1 }); - m_ds = pool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); - { - video::IGPUDescriptorSet::SDescriptorInfo info[2]; - info[0].desc = core::smart_refctd_ptr(inputBuf); - info[0].info.buffer = { .offset = 0, .size = data.inputBufferBytes }; - info[1].desc = core::smart_refctd_ptr(outputBuf); - info[1].info.buffer = { .offset = 0, .size = data.outputBufferBytes }; - video::IGPUDescriptorSet::SWriteDescriptorSet writes[2] = { - { .dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = &info[0] }, - { .dstSet = m_ds.get(), .binding = 1, .arrayElement = 0, .count = 1, .info = &info[1] } - }; - m_device->updateDescriptorSets(writes, {}); - } - - m_queue = m_device->getQueue(data.computeFamilyIndex, 0); - m_samplesPerDispatch = data.samplesPerDispatch; - m_physicalDevice = data.physicalDevice; - } - - void logPipelineReport(const std::string& name) const + public: + struct SetupData : GPUBenchmark::SetupData { - if (!m_executableReport.empty()) - m_logger->log("%s Sampler Benchmark Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, name.c_str(), m_executableReport.c_str()); - } + core::smart_refctd_ptr assetMgr; + GPUBenchmarkHelper::ShaderVariant variant; // precompiled key OR source path + defines + size_t outputBufferBytes; // sizeof(uint32_t) * threadsPerDispatch + }; - // Runs warmupIterations submits (unclocked), then benchmarkIterations submits under GPU timestamps. 
- void run(const std::string& samplerName, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) - { - m_device->waitIdle(); - recordBenchmarkCmdBuf(); - recordTimestampCmdBufs(); - - auto semaphore = m_device->createSemaphore(0u); - uint64_t semCounter = 0u; - - const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = { {.cmdbuf = m_benchmarkCmdbuf.get()} }; - const video::IQueue::SSubmitInfo::SCommandBufferInfo beforeCmds[] = { {.cmdbuf = m_timestampBeforeCmdbuf.get()} }; - const video::IQueue::SSubmitInfo::SCommandBufferInfo afterCmds[] = { {.cmdbuf = m_timestampAfterCmdbuf.get()} }; - - // Chains submissions via a timeline semaphore so they execute strictly in order - auto submitSerial = [&](const video::IQueue::SSubmitInfo::SCommandBufferInfo* cmds, uint32_t count) - { - const video::IQueue::SSubmitInfo::SSemaphoreInfo waitSem[] = { - {.semaphore = semaphore.get(), .value = semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} - }; - const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { - {.semaphore = semaphore.get(), .value = ++semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} - }; - video::IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = {cmds, count}; - submit.waitSemaphores = waitSem; - submit.signalSemaphores = signalSem; - m_queue->submit({&submit, 1u}); - }; - - for (uint32_t i = 0u; i < warmupIterations; ++i) - submitSerial(benchCmds, 1u); - - submitSerial(beforeCmds, 1u); - for (uint32_t i = 0u; i < benchmarkIterations; ++i) - submitSerial(benchCmds, 1u); - submitSerial(afterCmds, 1u); - - m_device->waitIdle(); - - uint64_t timestamps[2] = {}; - const auto flags = core::bitflag(video::IQueryPool::RESULTS_FLAGS::_64_BIT) | - core::bitflag(video::IQueryPool::RESULTS_FLAGS::WAIT_BIT); - m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags); - - const float64_t timestampPeriod = 
float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); - const float64_t elapsed_ns = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod; - const uint64_t total_samples = uint64_t(benchmarkIterations) * uint64_t(m_samplesPerDispatch); - const float64_t ps_per_sample = elapsed_ns * 1e3 / float64_t(total_samples); - const float64_t gsamples_per_s = float64_t(total_samples) / elapsed_ns; - const float64_t elapsed_ms = elapsed_ns * 1e-6; - - m_logger->log("[Benchmark] %-28s: %9.3f ps/sample | %10.3f GSamples/s | %10.3f ms total", - system::ILogger::ELL_PERFORMANCE, - samplerName.c_str(), ps_per_sample, gsamples_per_s, elapsed_ms); - } - -private: - void recordBenchmarkCmdBuf() - { - m_benchmarkCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_benchmarkCmdbuf->begin(video::IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); - m_benchmarkCmdbuf->bindComputePipeline(m_pipeline.get()); - m_benchmarkCmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); - m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - m_benchmarkCmdbuf->end(); - } - - void recordTimestampCmdBufs() - { - m_timestampBeforeCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampBeforeCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampBeforeCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); - m_timestampBeforeCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 0); - m_timestampBeforeCmdbuf->end(); + CSamplerBenchmark(Aggregator& aggregator, const SetupData& data) + : GPUBenchmark(aggregator, data) // slicing-copy of the GPUBenchmark::SetupData base + { + auto bda = createBdaOutputBuffer(data.outputBufferBytes); + m_outputBuf = std::move(bda.buf); + m_outputAddress = bda.address; - m_timestampAfterCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampAfterCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - 
m_timestampAfterCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 1); - m_timestampAfterCmdbuf->end(); - } + m_pipelineIdx = createPipeline(data.variant, data.assetMgr, sizeof(SamplerBenchPushConstants), joinName(data.name)); + } - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_logger; - core::smart_refctd_ptr m_cmdpool; - core::smart_refctd_ptr m_benchmarkCmdbuf; - core::smart_refctd_ptr m_timestampBeforeCmdbuf; - core::smart_refctd_ptr m_timestampAfterCmdbuf; - core::smart_refctd_ptr m_queryPool; - core::smart_refctd_ptr m_pplnLayout; - core::smart_refctd_ptr m_pipeline; - core::smart_refctd_ptr m_ds; - video::IDeviceMemoryAllocator::SAllocation m_inputAlloc = {}; - video::IDeviceMemoryAllocator::SAllocation m_outputAlloc = {}; - video::IQueue* m_queue = nullptr; - video::IPhysicalDevice* m_physicalDevice = nullptr; - uint32_t m_dispatchGroupCount = 0; - uint32_t m_samplesPerDispatch = 0; - std::string m_executableReport; + void doRun() override + { + const PipelineEntry* pe = getPipelineEntry(m_pipelineIdx, joinName(m_name)); + if (!pe) + return; + SamplerBenchPushConstants pc = {}; + pc.outputAddress = m_outputAddress; + + const TimingResult t = runTimedBudgeted(getWarmupDispatches(), getTargetBudgetMs(), + [&](video::IGPUCommandBuffer* cb) { defaultBindAndPush(cb, *pe, pc); }, + [this](video::IGPUCommandBuffer* cb) { defaultDispatch(cb); }, + samplesForCurrentRow()); + + record(m_name, t, pe->stats); + } + + private: + core::smart_refctd_ptr m_outputBuf; + uint64_t m_outputAddress = 0; + uint32_t m_pipelineIdx = 0; }; #endif diff --git a/37_HLSLSamplingTests/main.cpp b/37_HLSLSamplingTests/main.cpp index 98ea127cc..1c3f6000d 100644 --- a/37_HLSLSamplingTests/main.cpp +++ b/37_HLSLSamplingTests/main.cpp @@ -1,5 +1,8 @@ #include +#include +#include + #include "nbl/examples/examples.hpp" #include "nbl/this_example/builtin/build/spirv/keys.hpp" @@ -49,14 +52,14 @@ using namespace nbl::examples; #include 
"benchmarks/CSamplerBenchmark.h" #include "benchmarks/CDiscreteSamplerBenchmark.h" +#include "nbl/examples/Tester/FailureManifest.h" #include "tests/property/CSamplerPropertyTester.h" -constexpr bool DoBenchmark = true; class HLSLSamplingTests final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication { using device_base_t = application_templates::MonoDeviceApplication; - using asset_base_t = BuiltinResourcesApplication; + using asset_base_t = BuiltinResourcesApplication; public: HLSLSamplingTests(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) @@ -64,7 +67,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat virtual SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override { - auto retval = device_base_t::getPreferredDeviceFeatures(); + auto retval = device_base_t::getPreferredDeviceFeatures(); retval.pipelineExecutableInfo = true; return retval; } @@ -80,10 +83,10 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // test compile with dxc { IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; - auto key = nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get()); - auto bundle = m_assetMgr->getAsset(key.c_str(), lp); + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + auto key = nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get()); + auto bundle = m_assetMgr->getAsset(key.c_str(), lp); const auto assets = bundle.getContents(); if (assets.empty()) @@ -110,12 +113,19 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // Note: all samplers almost satisfy BasicSampler, but they have cache parameters in generate(). 
static_assert(sampling::concepts::BasicSampler>); static_assert(sampling::concepts::BasicSampler>); - static_assert(sampling::concepts::BasicSampler); - static_assert(sampling::concepts::BasicSampler); + static_assert(sampling::concepts::BasicSampler, sampling::TRACKING>>); + static_assert(sampling::concepts::BasicSampler, sampling::YOLO>>); + static_assert(sampling::concepts::BasicSampler, sampling::EYTZINGER>>); + static_assert(sampling::concepts::BasicSampler, ReadOnlyAccessor, 26>>); + static_assert(sampling::concepts::BasicSampler, 4>, ReadOnlyAccessor, 26>>); // --- TractableSampler (level 2) --- generate(domain_type, out cache_type) -> codomain_type, forwardPdf(domain_type, cache_type) -> density_type - static_assert(sampling::concepts::TractableSampler); - static_assert(sampling::concepts::TractableSampler); + ; + static_assert(sampling::concepts::TractableSampler, sampling::TRACKING>>); + static_assert(sampling::concepts::TractableSampler, sampling::YOLO>>); + static_assert(sampling::concepts::TractableSampler, sampling::EYTZINGER>>); + static_assert(sampling::concepts::TractableSampler, ReadOnlyAccessor, 26>>); + static_assert(sampling::concepts::TractableSampler, 4>, ReadOnlyAccessor, 26>>); static_assert(sampling::concepts::TractableSampler>); static_assert(sampling::concepts::TractableSampler>); static_assert(sampling::concepts::TractableSampler>); @@ -131,8 +141,11 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat static_assert(sampling::concepts::TractableSampler>); // --- ResamplableSampler (level 3, parallel) --- generate(domain_type, out cache_type) -> codomain_type, forwardWeight(domain_type, cache_type), backwardWeight(codomain_type) - static_assert(sampling::concepts::ResamplableSampler); - static_assert(sampling::concepts::ResamplableSampler); + static_assert(sampling::concepts::ResamplableSampler, sampling::TRACKING>>); + static_assert(sampling::concepts::ResamplableSampler, sampling::YOLO>>); + 
static_assert(sampling::concepts::ResamplableSampler, sampling::EYTZINGER>>); + static_assert(sampling::concepts::ResamplableSampler, ReadOnlyAccessor, 26>>); + static_assert(sampling::concepts::ResamplableSampler, 4>, ReadOnlyAccessor, 26>>); static_assert(sampling::concepts::ResamplableSampler>); static_assert(sampling::concepts::ResamplableSampler>); static_assert(sampling::concepts::ResamplableSampler>); @@ -155,8 +168,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); - static_assert(sampling::concepts::BackwardTractableSampler>); - static_assert(sampling::concepts::BackwardTractableSampler>); + //static_assert(sampling::concepts::BackwardTractableSampler>); // no backwardPdf + //static_assert(sampling::concepts::BackwardTractableSampler>); // no backwardPdf static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); @@ -166,7 +179,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat static_assert(sampling::concepts::BijectiveSampler>); static_assert(sampling::concepts::BijectiveSampler>); static_assert(sampling::concepts::BijectiveSampler>); - static_assert(sampling::concepts::BijectiveSampler>); + static_assert(sampling::concepts::BijectiveSampler>); static_assert(sampling::concepts::BijectiveSampler>); static_assert(sampling::concepts::BijectiveSampler>); @@ -177,92 +190,175 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat m_logger->log("All sampling concept tests passed.", ILogger::ELL_INFO); + const auto runControl = nbl::examples::testing::parseRunControl(this->argv, m_logger.get()); + if (!runControl.valid) + return false; + + 
nbl::examples::testing::FailureManifest failureManifest("37_HLSLSamplingTests"); + // ====================================================================== // GPU throughput benchmarks // ====================================================================== - const uint32_t testBatchCount = 1024; + constexpr uint32_t benchWorkgroupsCount = 4096; + constexpr bool DoBenchmark = true; if constexpr (DoBenchmark) { - constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; - constexpr uint32_t totalThreadsPerDispatch = testBatchCount * benchWorkgroupSize; - constexpr uint32_t iterationsPerThread = BENCH_ITERS; - constexpr uint32_t benchSamplesPerDispatch = totalThreadsPerDispatch * iterationsPerThread; - - struct BenchEntry + if (runControl.skipBenchmarks) { - CSamplerBenchmark bench; - std::string name; + m_logger->log("Skipping benchmark phase due to CLI.", ILogger::ELL_INFO); + } + else + { + constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; + constexpr uint32_t totalThreadsPerDispatch = benchWorkgroupsCount * benchWorkgroupSize; + constexpr uint32_t iterationsPerThread = BENCH_ITERS; + constexpr uint32_t benchSamplesPerDispatch = totalThreadsPerDispatch * iterationsPerThread; + constexpr uint32_t warmupDispatches = 300; // unmeasured warmup + cooldown around the timing window + constexpr uint64_t targetBudgetMs = 400; // wall-clock per row; runTimedBudgeted sizes dispatches + + std::vector benchmarks; + + // Single Aggregator owns results, baselines, formatting, and reporting + // for both bench classes. Passed by reference into each bench's ctor. + Aggregator agg(m_logger, m_device, m_physicalDevice, getComputeQueue()->getFamilyIndex()); + const auto cli = agg.applyCli({ + .argv = this->argv, + .defaultOutputPath = "SamplerBench.json", + .appName = "37_HLSLSamplingTests", + }); + + // One context for the whole sampler-bench span: drives both the per-bench + // shape/budget and the banner that runSessionAndReport prints. 
+ const RunContext samplerCtx = { + .shape = { + .workgroupSize = {benchWorkgroupSize, 1u, 1u}, + .dispatchGroupCount = {benchWorkgroupsCount, 1u, 1u}, + .samplesPerDispatch = benchSamplesPerDispatch, + }, + .targetBudgetMs = targetBudgetMs, + .sectionLabel = "GPU Sampler Benchmarks", }; - std::vector benchmarks; - auto addBench = [&](const char* name, const std::string& shaderKey, size_t inputSize, size_t outputSize) + auto addBench = [&](const std::initializer_list name, GPUBenchmarkHelper::ShaderVariant variant, size_t outputSize) { - auto& entry = benchmarks.emplace_back(); - entry.name = name; - CSamplerBenchmark::SetupData data; - data.device = m_device; - data.api = m_api; - data.assetMgr = m_assetMgr; - data.logger = m_logger; - data.physicalDevice = m_physicalDevice; - data.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - data.shaderKey = shaderKey; - data.dispatchGroupCount = testBatchCount; - data.samplesPerDispatch = benchSamplesPerDispatch; - data.inputBufferBytes = inputSize; + data.assetMgr = m_assetMgr; + data.name = name; + data.variant = std::move(variant); data.outputBufferBytes = outputSize; - entry.bench.setup(data); + data.warmupDispatches = warmupDispatches; + data.shape = samplerCtx.shape; + data.targetBudgetMs = samplerCtx.targetBudgetMs; + + benchmarks.emplace_back(agg, data); }; - // Bench shaders don't read input (hardcoded values) and write a single uint32_t per thread via RWByteAddressBuffer - constexpr size_t benchInputBytes = sizeof(uint32_t); // unused but binding must exist, didn't bother removing because some samplers need more complex inputs and it's easier to have a consistent buffer setup for all benchmarks - constexpr size_t benchOutputBytes = sizeof(uint32_t) * totalThreadsPerDispatch; - addBench("Linear", nbl::this_example::builtin::build::get_spirv_key<"linear_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("Bilinear", 
nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("BoxMullerTransform", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("UniformHemisphere", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("UniformSphere", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ConcentricMapping", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("PolarMapping", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedHemisphere", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphere", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalRectangle", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphericalRectangle", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalTriangle", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphericalTriangle", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - - // Print all pipeline reports first - for (auto& entry : 
benchmarks) - entry.bench.logPipelineReport(entry.name); + // Convenience wrappers so the 35+ existing precompiled-key calls below stay + // one line each, and adding a new runtime variant is also a one-liner without + // CMake JSON edits. Both go through the same addBench, just construct the + // ShaderVariant differently. + auto addPrecompiled = [&](std::initializer_list name, size_t outputSize) + { + auto shader = nbl::this_example::builtin::build::get_spirv_key(m_device.get()); + addBench(name, GPUBenchmarkHelper::ShaderVariant::Precompiled(std::move(shader)), outputSize); + }; + auto addRuntime = [&](std::initializer_list name, const char* sourcePath, std::vector defines, size_t outputSize) + { + // Mirror CMake's COMMON_OPTIONS so runtime variants see the same baseline + // as precompiled ones. + std::vector all = { + {"WORKGROUP_SIZE", std::to_string(WORKGROUP_SIZE)}, + {"BENCH_ITERS", std::to_string(BENCH_ITERS)}, + }; + all.insert(all.end(), std::make_move_iterator(defines.begin()), std::make_move_iterator(defines.end())); + addBench(name, GPUBenchmarkHelper::ShaderVariant::FromSource(sourcePath, std::move(all)), outputSize); + }; + + // Bench shaders don't read input -- output is BDA via push constants. 
+ if constexpr (true) + { + constexpr size_t benchOutputBytes = sizeof(uint32_t) * totalThreadsPerDispatch; + addPrecompiled.operator()<"linear_bench_1_1">({"Linear", "Linear", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"linear_bench_1_16">({"Linear", "Linear", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"bilinear_bench_1_1">({"Linear", "Bilinear", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"bilinear_bench_1_16">({"Linear", "Bilinear", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"box_muller_transform_bench_1_1">({"Gaussian", "BoxMullerTransform", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"box_muller_transform_bench_1_16">({"Gaussian", "BoxMullerTransform", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"uniform_hemisphere_bench_1_1">({"SphereSampling", "UniformHemisphere", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"uniform_hemisphere_bench_1_16">({"SphereSampling", "UniformHemisphere", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"uniform_sphere_bench_1_1">({"SphereSampling", "UniformSphere", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"uniform_sphere_bench_1_16">({"SphereSampling", "UniformSphere", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"projected_hemisphere_bench_1_1">({"SphereSampling", "ProjectedHemisphere", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"projected_hemisphere_bench_1_16">({"SphereSampling", "ProjectedHemisphere", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"projected_sphere_bench_1_1">({"SphereSampling", "ProjectedSphere", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"projected_sphere_bench_1_16">({"SphereSampling", "ProjectedSphere", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"concentric_mapping_bench_1_1">({"DiskMappers", "ConcentricMapping", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"concentric_mapping_bench_1_16">({"DiskMappers", "ConcentricMapping", "1:16"}, 
benchOutputBytes); + addPrecompiled.operator()<"polar_mapping_bench_1_1">({"DiskMappers", "PolarMapping", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"polar_mapping_bench_1_16">({"DiskMappers", "PolarMapping", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_1_1_shape_observer">({"SphShapes", "SphRect", "1:1", "shape,observer"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_1_1_sa_extents">({"SphShapes", "SphRect", "1:1", "sa,extents"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_1_1_r0_extents">({"SphShapes", "SphRect", "1:1", "r0,extents"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_1_16_shape_observer">({"SphShapes", "SphRect", "1:16", "shape,observer"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_1_16_sa_extents">({"SphShapes", "SphRect", "1:16", "sa,extents"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_1_16_r0_extents">({"SphShapes", "SphRect", "1:16", "r0,extents"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_create_only_shape_observer">({"SphShapes", "SphRect", "create-only", "shape,observer"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_create_only_sa_extents">({"SphShapes", "SphRect", "create-only", "sa,extents"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_rectangle_bench_create_only_r0_extents">({"SphShapes", "SphRect", "create-only", "r0,extents"}, benchOutputBytes); + addPrecompiled.operator()<"projected_spherical_rectangle_bench_1_1">({"SphShapes", "ProjSphRect", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"projected_spherical_rectangle_bench_1_16">({"SphShapes", "ProjSphRect", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"projected_spherical_rectangle_bench_create_only">({"SphShapes", "ProjSphRect", "create-only"}, benchOutputBytes); + 
addPrecompiled.operator()<"spherical_triangle_bench_1_1">({"SphShapes", "SphTri", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_triangle_bench_1_16">({"SphShapes", "SphTri", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"spherical_triangle_bench_create_only">({"SphShapes", "SphTri", "create-only"}, benchOutputBytes); + addPrecompiled.operator()<"projected_spherical_triangle_bench_1_1">({"SphShapes", "ProjSphTri", "1:1"}, benchOutputBytes); + addPrecompiled.operator()<"projected_spherical_triangle_bench_1_16">({"SphShapes", "ProjSphTri", "1:16"}, benchOutputBytes); + addPrecompiled.operator()<"projected_spherical_triangle_bench_create_only">({"SphShapes", "ProjSphTri", "create-only"}, benchOutputBytes); + // ---- Runtime-compiled demo variants (no CMake JSON edit needed) ---- + // Same .hlsl source as the precompiled "linear_bench_1_*" entries, but with + // a `BENCH_SAMPLES_PER_CREATE` value that has no JSON entry. Add as many + // here as you want -- each is a one-liner, no reconfigure required. 
+ //addRuntime({"Linear", "Linear", "1:4 (rt)"}, "shaders/linear_test.comp.hlsl", {{"BENCH_SAMPLES_PER_CREATE", "4"}}, benchOutputBytes); + //addRuntime({"Linear", "Linear", "1:8 (rt)"}, "shaders/linear_test.comp.hlsl", {{"BENCH_SAMPLES_PER_CREATE", "8"}}, benchOutputBytes); + } // Discrete sampler benchmark: alias table vs cumulative probability (BDA) { CDiscreteSamplerBenchmark::SetupData dsData; - dsData.device = m_device; - dsData.api = m_api; - dsData.assetMgr = m_assetMgr; - dsData.logger = m_logger; - dsData.physicalDevice = m_physicalDevice; - dsData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - dsData.aliasShaderKey = nbl::this_example::builtin::build::get_spirv_key<"alias_table_bench">(m_device.get()); - dsData.cumProbShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get()); - dsData.dispatchGroupCount = testBatchCount; - dsData.tableSize = 1024; - - CDiscreteSamplerBenchmark discreteBench; - discreteBench.setup(dsData); - - // Then run all benchmarks here so the reports are at the top of the log, followed by timings - constexpr uint32_t warmupDispatches = 500; - constexpr uint32_t benchDispatches = 5000; - m_logger->log("=== GPU Sampler Benchmarks (%u dispatches, %u threads/dispatch, %u iters/thread, ps/sample is per all GPU threads) ===", - ILogger::ELL_PERFORMANCE, benchDispatches, totalThreadsPerDispatch, iterationsPerThread); - for (auto& entry : benchmarks) - entry.bench.run(entry.name, warmupDispatches, benchDispatches); - - discreteBench.run(warmupDispatches, benchDispatches); + dsData.assetMgr = m_assetMgr; + dsData.packedAliasAVariant = GPUBenchmarkHelper::ShaderVariant::Precompiled(nbl::this_example::builtin::build::get_spirv_key<"packed_alias_a_bench">(m_device.get())); + dsData.packedAliasBVariant = GPUBenchmarkHelper::ShaderVariant::Precompiled(nbl::this_example::builtin::build::get_spirv_key<"packed_alias_b_bench">(m_device.get())); + dsData.cumProbVariant = 
GPUBenchmarkHelper::ShaderVariant::Precompiled(nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get())); + dsData.cumProbYoloVariant = GPUBenchmarkHelper::ShaderVariant::Precompiled(nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_yolo_bench">(m_device.get())); + dsData.cumProbEytzingerVariant = GPUBenchmarkHelper::ShaderVariant::Precompiled(nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_eytzinger_bench">(m_device.get())); + dsData.dispatchGroupCount = {benchWorkgroupsCount, 1u, 1u}; + dsData.targetBudgetMs = targetBudgetMs; + + // Just the N values now -- runTimedBudgeted sizes dispatches per + // row to hit the budget. The old per-N tuning table is gone. + static constexpr uint32_t kSweepNs[] = { + 2u, 4u, 8u, 16u, 32u, 64u, 100u, 128u, 256u, 400u, + 512u, 1024u, 2048u, 2049u, 3000u, 4096u, 7000u, 8192u, 10'000u, 16'384u, 32'768u, + 65'536u, 131'072u, 262'144u, 524'288u, 1'000'000u, 1'048'576u, 2'097'152u, 16'777'216u, 20'971'520u, 25'165'824u, 33'554'432u}; + dsData.sweepNs = kSweepNs; + + CDiscreteSamplerBenchmark discreteBench(agg, dsData); + + const RunContext discreteCtx = { + .shape = CDiscreteSamplerBenchmark::shapeFor(dsData), + .targetBudgetMs = targetBudgetMs, + .sectionLabel = "Discrete Sampler Sweep", + }; + + // Single call. Each span contributes its own focus rows first, then + // every span's unfocused rows -- the aggregator iterates both packs + // in each phase. CDiscrete's overridden run() does per-row filtering + // against cli.focusVariants since its rows aren't a flat list. 
+ agg.runSessionAndReport( + Aggregator::makeSpan(benchmarks, samplerCtx), + Aggregator::makeSpan(discreteBench, discreteCtx)); + } } } @@ -270,57 +366,80 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // Runtime CPU/GPU comparison tests using ITester harness // ================================================================ bool pass = true; - const uint32_t workgroupSize = WORKGROUP_SIZE; - + constexpr uint32_t testWorkgroupsCount = 4096; + bool samplerPass = true; // generic lambda to run a GPU sampler test - auto runSamplerTest = [&](const char* testName, auto spirvKey, const char* logFile) + auto runSamplerTest = [&](const char* id, const char* testName, const char* logFile) { + if (!runControl.filter.shouldRun(id)) + { + m_logger->log("Skipping %s tests due to filter.", ILogger::ELL_INFO, testName); + return; + } + m_logger->log("Running %s tests...", ILogger::ELL_INFO, testName); typename Tester::PipelineSetupData data; - data.device = m_device; - data.api = m_api; - data.assetMgr = m_assetMgr; - data.logger = m_logger; - data.physicalDevice = m_physicalDevice; + data.device = m_device; + data.api = m_api; + data.assetMgr = m_assetMgr; + data.logger = m_logger; + data.physicalDevice = m_physicalDevice; data.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - data.shaderKey = spirvKey; - Tester tester(testBatchCount, workgroupSize); + data.shaderKey = std::move(nbl::this_example::builtin::build::get_spirv_key(m_device.get())); + Tester tester(testWorkgroupsCount); tester.setupPipeline(data); - pass &= tester.performTestsAndVerifyResults(logFile); + if (const auto seed = runControl.filter.seedFor(id); seed.has_value()) + tester.setSeed(*seed); + tester.setFailureRecordContext(&failureManifest, "sampler", id, testName); + samplerPass &= tester.performTestsAndVerifyResults(logFile); }; // --- Sampler tests --- if constexpr (true) { - runSamplerTest.operator()("Linear sampler", 
nbl::this_example::builtin::build::get_spirv_key<"linear_test">(m_device.get()), "LinearTestLog.txt"); - runSamplerTest.operator()("Bilinear sampler", nbl::this_example::builtin::build::get_spirv_key<"bilinear_test">(m_device.get()), "BilinearTestLog.txt"); - runSamplerTest.operator()("UniformHemisphere sampler", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_test">(m_device.get()), "UniformHemisphereTestLog.txt"); - runSamplerTest.operator()("UniformSphere sampler", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_test">(m_device.get()), "UniformSphereTestLog.txt"); - runSamplerTest.operator()("ProjectedHemisphere sampler", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_test">(m_device.get()), "ProjectedHemisphereTestLog.txt"); - runSamplerTest.operator()("ProjectedSphere sampler", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_test">(m_device.get()), "ProjectedSphereTestLog.txt"); - runSamplerTest.operator()("ConcentricMapping sampler", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_test">(m_device.get()), "ConcentricMappingTestLog.txt"); - runSamplerTest.operator()("PolarMapping sampler", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_test">(m_device.get()), "PolarMappingTestLog.txt"); - runSamplerTest.operator()("BoxMullerTransform sampler", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_test">(m_device.get()), "BoxMullerTransformTestLog.txt"); - runSamplerTest.operator()("SphericalTriangle", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle">(m_device.get()), "SphericalTriangleTestLog.txt"); - runSamplerTest.operator()("ProjectedSphericalTriangle sampler", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_test">(m_device.get()), "ProjectedSphericalTriangleTestLog.txt"); - runSamplerTest.operator()("SphericalRectangle sampler", 
nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_test">(m_device.get()), "SphericalRectangleTestLog.txt"); - runSamplerTest.operator()("ProjectedSphericalRectangle sampler", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_test">(m_device.get()), "ProjectedSphericalRectangleTestLog.txt"); + runSamplerTest.operator()("sampler/Linear", "Linear sampler", "LinearTestLog.txt"); + runSamplerTest.operator()("sampler/Bilinear", "Bilinear sampler", "BilinearTestLog.txt"); + runSamplerTest.operator()("sampler/UniformHemisphere", "UniformHemisphere sampler", "UniformHemisphereTestLog.txt"); + runSamplerTest.operator()("sampler/UniformSphere", "UniformSphere sampler", "UniformSphereTestLog.txt"); + runSamplerTest.operator()("sampler/ProjectedHemisphere", "ProjectedHemisphere sampler", "ProjectedHemisphereTestLog.txt"); + runSamplerTest.operator()("sampler/ProjectedSphere", "ProjectedSphere sampler", "ProjectedSphereTestLog.txt"); + runSamplerTest.operator()("sampler/ConcentricMapping", "ConcentricMapping sampler", "ConcentricMappingTestLog.txt"); + runSamplerTest.operator()("sampler/PolarMapping", "PolarMapping sampler", "PolarMappingTestLog.txt"); + runSamplerTest.operator()("sampler/BoxMullerTransform", "BoxMullerTransform sampler", "BoxMullerTransformTestLog.txt"); + runSamplerTest.operator()("sampler/SphericalTriangle", "SphericalTriangle", "SphericalTriangleTestLog.txt"); + runSamplerTest.operator()("sampler/ProjectedSphericalTriangle", "ProjectedSphericalTriangle sampler", "ProjectedSphericalTriangleTestLog.txt"); + runSamplerTest.operator()("sampler/SphericalRectangle", "SphericalRectangle sampler", "SphericalRectangleTestLog.txt"); + runSamplerTest.operator()("sampler/ProjectedSphericalRectangle", "ProjectedSphericalRectangle sampler", "ProjectedSphericalRectangleTestLog.txt"); } if constexpr (true) { // --- Discrete table construction (CPU) --- { - m_logger->log("Running discrete table builder tests (CPU)...", 
ILogger::ELL_INFO); - CDiscreteTableTester tableTester(m_logger.get()); - pass &= tableTester.run(); + constexpr const char* id = "sampler/DiscreteTableBuilder"; + if (!runControl.filter.shouldRun(id)) + { + m_logger->log("Skipping discrete table builder tests due to filter.", ILogger::ELL_INFO); + } + else + { + m_logger->log("Running discrete table builder tests (CPU)...", ILogger::ELL_INFO); + CDiscreteTableTester tableTester(m_logger.get()); + const bool ok = tableTester.run(); + samplerPass &= ok; + if (!ok) + failureManifest.addGroupFailure("sampler", id, "Discrete table builder"); + } } // --- GPU table sampler tests --- - runSamplerTest.operator()("AliasTable GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"alias_table_test">(m_device.get()), "AliasTableTestLog.txt"); - runSamplerTest.operator()("CumulativeProbability GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_test">(m_device.get()), "CumulativeProbabilityTestLog.txt"); + runSamplerTest.operator()("sampler/PackedAliasA", "PackedAliasA GPU sampler", "PackedAliasATestLog.txt"); + runSamplerTest.operator()("sampler/PackedAliasB", "PackedAliasB GPU sampler", "PackedAliasBTestLog.txt"); + runSamplerTest.operator()("sampler/CumulativeProbability", "CumulativeProbability GPU sampler", "CumulativeProbabilityTestLog.txt"); } - if (pass) + logJacobianSkipCounts(m_logger.get()); + pass &= samplerPass; + if (samplerPass) m_logger->log("All sampling tests PASSED.", ILogger::ELL_INFO); else m_logger->log("Some sampling tests FAILED. Check log files for details.", ILogger::ELL_ERROR); @@ -330,66 +449,55 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // ================================================================ if constexpr (true) { + bool propertyPass = true; m_logger->log("Running sampler property tests (CPU)...", ILogger::ELL_INFO); m_logger->log("WARNING: CPU math may use higher intermediate precision than GPU shaders. 
Tolerances that pass here may be too tight for GPU.", ILogger::ELL_WARNING); - CSamplerPropertyTester linearProps(m_logger.get()); - pass &= linearProps.run(); - - CSamplerPropertyTester bilinearProps(m_logger.get()); - pass &= bilinearProps.run(); - - CSamplerPropertyTester uniformHemiProps(m_logger.get()); - pass &= uniformHemiProps.run(); - - CSamplerPropertyTester uniformSphereProps(m_logger.get()); - pass &= uniformSphereProps.run(); - - CSamplerPropertyTester projHemiProps(m_logger.get()); - pass &= projHemiProps.run(); - - CSamplerPropertyTester projSphereProps(m_logger.get()); - pass &= projSphereProps.run(); - - CSamplerPropertyTester concentricProps(m_logger.get()); - pass &= concentricProps.run(); - - CSamplerPropertyTester polarProps(m_logger.get()); - pass &= polarProps.run(); - - CSamplerPropertyTester boxMullerProps(m_logger.get()); - pass &= boxMullerProps.run(); - - CSamplerPropertyTester sphTriProps(m_logger.get()); - pass &= sphTriProps.run(); - - CSamplerPropertyTester projSphTriProps(m_logger.get()); - pass &= projSphTriProps.run(); - - CSamplerPropertyTester sphRectProps(m_logger.get()); - pass &= sphRectProps.run(); + auto check = [&]() + { + const std::string id = std::string("property/") + Config::name(); + if (!runControl.filter.shouldRun(id)) + { + m_logger->log("Skipping %s property tests due to filter.", ILogger::ELL_INFO, Config::name()); + return; + } + + CSamplerPropertyTester tester(m_logger.get(), runControl.filter.seedFor(id)); + const bool ok = tester.run(); + propertyPass &= ok; + if (!ok) + { + failureManifest.addGroupFailure("property", id, Config::name()); + if (const auto seed = tester.failureSeed(); seed.has_value()) + failureManifest.addCase("property", id, Config::name(), "property", "CPU", 0, *seed, 0.0, 0.0); + } + }; - CSamplerPropertyTester projSphRectProps(m_logger.get()); - pass &= projSphRectProps.run(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); + 
check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); // Stress tests: extreme coefficient ratios - CSamplerPropertyTester linearStress(m_logger.get()); - pass &= linearStress.run(); - - CSamplerPropertyTester bilinearStress(m_logger.get()); - pass &= bilinearStress.run(); - - CSamplerPropertyTester bilinearPST(m_logger.get()); - pass &= bilinearPST.run(); - - CSamplerPropertyTester sphTriStress(m_logger.get()); - pass &= sphTriStress.run(); + check.operator()(); + check.operator()(); + check.operator()(); + check.operator()(); // Grazing angle tests - CSamplerPropertyTester grazingProps(m_logger.get()); - pass &= grazingProps.run(); + check.operator()(); - if (pass) + pass &= propertyPass; + if (propertyPass) m_logger->log("All sampler property tests PASSED.", ILogger::ELL_INFO); else m_logger->log("Some sampler property tests FAILED.", ILogger::ELL_ERROR); @@ -398,34 +506,43 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // ================================================================ // Solid angle accuracy and small triangle convergence tests (CPU-only) // ================================================================ + if constexpr (true) { + bool geometryPass = true; m_logger->log("Running geometry tests (CPU)...", ILogger::ELL_INFO); m_logger->log("WARNING: CPU math may use higher intermediate precision than GPU shaders. 
Tolerances that pass here may be too tight for GPU.", ILogger::ELL_WARNING); - CSolidAngleAccuracyTester solidAngleTester(m_logger.get()); - pass &= solidAngleTester.run(); - - CSphericalTriangleGenerateTester sphTriGenTester(m_logger.get()); - pass &= sphTriGenTester.run(); - - CSphericalRectangleGenerateTester sphRectGenTester(m_logger.get()); - pass &= sphRectGenTester.run(); - - CProjectedSphericalRectangleGenerateTester projRectGenTester(m_logger.get()); - pass &= projRectGenTester.run(); - - CProjectedSphericalRectangleGeometricTester projRectGeoTester(m_logger.get()); - pass &= projRectGeoTester.run(); + auto check = [&](const char* id, const char* name) + { + if (!runControl.filter.shouldRun(id)) + { + m_logger->log("Skipping %s geometry tests due to filter.", ILogger::ELL_INFO, name); + return; + } + + const bool ok = Tester(m_logger.get()).run(); + geometryPass &= ok; + if (!ok) + failureManifest.addGroupFailure("geometry", id, name); + }; - CProjectedSphericalTriangleGeometricTester pstTester(m_logger.get()); - pass &= pstTester.run(); + check.template operator()("geometry/SolidAngleAccuracy", "SolidAngleAccuracy"); + check.template operator()("geometry/SphericalTriangleGenerate", "SphericalTriangleGenerate"); + check.template operator()("geometry/SphericalRectangleGenerate", "SphericalRectangleGenerate"); + check.template operator()("geometry/ProjectedSphericalRectangleGenerate", "ProjectedSphericalRectangleGenerate"); + check.template operator()("geometry/ProjectedSphericalRectangle", "ProjectedSphericalRectangle"); + check.template operator()("geometry/ProjectedSphericalTriangle", "ProjectedSphericalTriangle"); - if (pass) + pass &= geometryPass; + if (geometryPass) m_logger->log("All geometry tests PASSED.", ILogger::ELL_INFO); else m_logger->log("Some geometry tests FAILED.", ILogger::ELL_ERROR); } + if (!runControl.failedOutPath.empty()) + pass &= nbl::examples::testing::writeFailureManifestFile(failureManifest, runControl.failedOutPath, 
m_logger.get()); + return pass; } diff --git a/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h b/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h index 87aac65ba..7665ebbb7 100644 --- a/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h +++ b/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h @@ -6,13 +6,31 @@ #include "nbl/examples/Tester/ITester.h" #include "SamplerTestHelpers.h" -class CAliasTableGPUTester final : public ITester +// Shared GPU correctness harness for the packed alias variants. Labels for +// failed-field messages are selected from the Executor type at compile time. +template +class CPackedAliasTableGPUTester final : public ITester { - using base_t = ITester; - using R = AliasTableTestResults; + using base_t = ITester; + using R = AliasTableTestResults; + + using typename base_t::TestType; + using base_t::getRandomEngine; + using base_t::verifyTestValue; + using base_t::printTestFail; + + static constexpr bool kIsA = std::is_same_v; + static constexpr const char* kGeneratedIdxName = kIsA ? "PackedAliasA::generatedIndex" : "PackedAliasB::generatedIndex"; + static constexpr const char* kForwardPdfName = kIsA ? "PackedAliasA::forwardPdf" : "PackedAliasB::forwardPdf"; + static constexpr const char* kBackwardPdfName = kIsA ? "PackedAliasA::backwardPdf" : "PackedAliasB::backwardPdf"; + static constexpr const char* kForwardWeightName = kIsA ? "PackedAliasA::forwardWeight" : "PackedAliasB::forwardWeight"; + static constexpr const char* kBackwardWeightName = kIsA ? "PackedAliasA::backwardWeight" : "PackedAliasB::backwardWeight"; + static constexpr const char* kJacobianName = kIsA ? "PackedAliasA::jacobianProduct" : "PackedAliasB::jacobianProduct"; + static constexpr const char* kPdfConsistencyName = kIsA ? "PackedAliasA::pdf consistency" : "PackedAliasB::pdf consistency"; + static constexpr const char* kWeightConsistencyName = kIsA ? 
"PackedAliasA::weight consistency" : "PackedAliasB::weight consistency"; public: - CAliasTableGPUTester(const uint32_t testBatchCount, const uint32_t workgroupSize) : base_t(testBatchCount, workgroupSize) {} + CPackedAliasTableGPUTester(const uint32_t testBatchCount) : base_t(testBatchCount, WORKGROUP_SIZE) {} private: AliasTableInputValues generateInputTestValues() override @@ -27,7 +45,7 @@ class CAliasTableGPUTester final : public ITester; +using CPackedAliasBGPUTester = CPackedAliasTableGPUTester; + #endif diff --git a/37_HLSLSamplingTests/tests/CBilinearTester.h b/37_HLSLSamplingTests/tests/CBilinearTester.h index 68605e90a..f5bea6896 100644 --- a/37_HLSLSamplingTests/tests/CBilinearTester.h +++ b/37_HLSLSamplingTests/tests/CBilinearTester.h @@ -14,7 +14,7 @@ class CBilinearTester final : public ITester #include #include +#include // Generic ReadOnly accessor wrapping a raw pointer template + requires std::is_arithmetic_v struct ReadOnlyAccessor { - using value_type = T; - template requires std::is_arithmetic_v - void get(I i, V& val) const { val = V(data[i]); } - T operator[](uint32_t i) const { return data[i]; } + using value_type = T; + template + requires std::is_arithmetic_v + void get(I i, V& val) const { val = V(data[i]); } - const T* data; + const T* data; }; -using ProbabilityAccessor = ReadOnlyAccessor; -using AliasIndexAccessor = ReadOnlyAccessor; -using PdfAccessor = ReadOnlyAccessor; - -using TestAliasTable = nbl::hlsl::sampling::AliasTable; -using TestCumulativeProbabilitySampler = nbl::hlsl::sampling::CumulativeProbabilitySampler>; - // Tests table construction for both alias method and cumulative probability. // Sampler generate/pdf correctness is verified by GPU testers (CAliasTableGPUTester, CCumulativeProbabilityGPUTester). 
class CDiscreteTableTester { -public: - CDiscreteTableTester(system::ILogger* logger) : m_logger(logger) {} - - bool run() - { - bool pass = true; - auto cases = createTestCases(); - - m_logger->log("AliasTableBuilder tests:", system::ILogger::ELL_INFO); - for (const auto& tc : cases) - pass &= testAliasTable(tc.name, tc.weights); - - m_logger->log("CumulativeProbability tests:", system::ILogger::ELL_INFO); - for (const auto& tc : cases) - pass &= testCumulativeProbability(tc.name, tc.weights); - - return pass; - } - -private: - struct TestCase - { - const char* name; - std::vector weights; - }; - - static std::vector createTestCases() - { - std::vector cases; - cases.push_back({"Uniform(4)", {1.0f, 1.0f, 1.0f, 1.0f}}); - cases.push_back({"NonUniform(1,2,3,4)", {1.0f, 2.0f, 3.0f, 4.0f}}); - - { - std::vector w(32, 1.0f); - w[31] = 97.0f; - cases.push_back({"SingleDominant(32)", std::move(w)}); - } - { - std::vector w(64); - for (uint32_t i = 0; i < 64; i++) - w[i] = 1.0f / float(i + 1); - cases.push_back({"PowerLaw(64)", std::move(w)}); - } - - cases.push_back({"SingleNonZero(4)", {0.0f, 0.0f, 5.0f, 0.0f}}); - - { - std::vector w(1024); - std::mt19937 rng(42); - std::uniform_real_distribution dist(0.001f, 100.0f); - for (uint32_t i = 0; i < 1024; i++) - w[i] = dist(rng); - cases.push_back({"Random(1024)", std::move(w)}); - } - - return cases; - } - - // Verify all values in array are in [0, 1] - bool verifyRange01(const char* prefix, const char* name, const char* arrayName, const float* data, uint32_t count) const - { - bool pass = true; - for (uint32_t i = 0; i < count; i++) - { - if (data[i] < 0.0f || data[i] > 1.0f + 1e-6f) - { - m_logger->log("%s[%s] %s[%u] = %f out of range [0, 1]", - system::ILogger::ELL_ERROR, prefix, name, arrayName, i, data[i]); - pass = false; - } - } - return pass; - } - - // Shared: verify PDFs sum to 1 and each matches weight/totalWeight - bool verifyPdf(const char* prefix, const char* name, const float* pdf, const std::vector& 
weights) const - { - const uint32_t N = static_cast(weights.size()); - float totalWeight = 0.0f; - for (uint32_t i = 0; i < N; i++) - totalWeight += weights[i]; - - bool pass = true; - - float pdfSum = 0.0f; - for (uint32_t i = 0; i < N; i++) - pdfSum += pdf[i]; - - if (std::abs(pdfSum - 1.0f) > 1e-5f) - { - m_logger->log("%s[%s] PDF sum: expected 1.0, got %f", system::ILogger::ELL_ERROR, prefix, name, pdfSum); - pass = false; - } - - for (uint32_t i = 0; i < N; i++) - { - const float expected = weights[i] / totalWeight; - const float err = std::abs(expected - pdf[i]); - if (err > 1e-6f) - { - m_logger->log("%s[%s] pdf[%u]: expected %f, got %f (err=%e)", system::ILogger::ELL_ERROR, prefix, name, i, expected, pdf[i], err); - pass = false; - } - } - - return pass; - } - - // Verify alias table builder output: - // - bucket contributions reconstruct correct probabilities - // - PDFs sum to 1 and match weight/totalWeight - // - alias indices in range, probabilities in [0, 1] - bool testAliasTable(const char* name, const std::vector& weights) const - { - const uint32_t N = static_cast(weights.size()); - - std::vector outProbability(N); - std::vector outAlias(N); - std::vector outPdf(N); - std::vector workspace(N); - - nbl::hlsl::sampling::AliasTableBuilder::build({ weights },outProbability.data(), outAlias.data(), outPdf.data(), workspace.data()); - - // Accumulate bucket contributions - std::vector dest(N, 0.0f); - for (uint32_t i = 0; i < N; i++) - { - dest[i] += outProbability[i]; - dest[outAlias[i]] += (1.0f - outProbability[i]); - } - - bool pass = true; - - float totalWeight = 0.0f; - for (uint32_t i = 0; i < N; i++) - totalWeight += weights[i]; - - for (uint32_t i = 0; i < N; i++) - { - const float expected = weights[i] / totalWeight * float(N); - const float err = std::abs(expected - dest[i]); - const float tolerance = std::max(1e-5f * float(N), 1e-4f); - - if (err > tolerance) - { - m_logger->log("AliasTable[%s] bucket %u: expected %f, got %f (err=%e)", - 
system::ILogger::ELL_ERROR, name, i, expected, dest[i], err); - pass = false; - } - } - - // Alias indices in range - for (uint32_t i = 0; i < N; i++) - { - if (outAlias[i] >= N) - { - m_logger->log("AliasTable[%s] alias[%u] = %u out of range [0, %u)", - system::ILogger::ELL_ERROR, name, i, outAlias[i], N); - pass = false; - } - } - - pass &= verifyPdf("AliasTable", name, outPdf.data(), weights); - pass &= verifyRange01("AliasTable", name, "probability", outProbability.data(), N); - - if (pass) - m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); - - return pass; - } - - // Verify CDF table construction: - // - cumulative probabilities are monotonically non-decreasing - // - PDFs match weight/totalWeight - // - PDFs sum to 1 - bool testCumulativeProbability(const char* name, const std::vector& weights) const - { - const uint32_t N = static_cast(weights.size()); - - std::vector cumProb(N - 1); - - nbl::hlsl::sampling::computeNormalizedCumulativeHistogram( - std::span(weights), - cumProb.data()); - - bool pass = true; - - // Monotonically non-decreasing - for (uint32_t i = 1; i < N - 1; i++) - { - if (cumProb[i] < cumProb[i - 1] - 1e-7f) - { - m_logger->log("CumProb[%s] non-monotonic at %u: cumProb[%u]=%f < cumProb[%u]=%f", - system::ILogger::ELL_ERROR, name, i, i, cumProb[i], i - 1, cumProb[i - 1]); - pass = false; - } - } - - // Last stored entry should be < 1.0 (the Nth bucket is implicitly 1.0) - if (N > 1 && cumProb[N - 2] >= 1.0f + 1e-6f) - { - m_logger->log("CumProb[%s] last stored entry %f >= 1.0", - system::ILogger::ELL_ERROR, name, cumProb[N - 2]); - pass = false; - } - - // Derive PDF from CDF for verification - std::vector pdf(N); - for (uint32_t i = 0; i < N; i++) - { - const float cur = (i < N - 1) ? cumProb[i] : 1.0f; - const float prev = (i > 0) ? 
cumProb[i - 1] : 0.0f; - pdf[i] = cur - prev; - } - - pass &= verifyPdf("CumProb", name, pdf.data(), weights); - pass &= verifyRange01("CumProb", name, "cumProb", cumProb.data(), N - 1); - - if (pass) - m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); - - return pass; - } - - system::ILogger* m_logger; + public: + CDiscreteTableTester(system::ILogger* logger) : m_logger(logger) {} + + bool run() + { + bool pass = true; + auto cases = createTestCases(); + + m_logger->log("AliasTableBuilder tests:", system::ILogger::ELL_INFO); + for (const auto& tc : cases) + pass &= testAliasTable(tc.name, tc.weights); + + m_logger->log("CumulativeProbability tests:", system::ILogger::ELL_INFO); + for (const auto& tc : cases) + pass &= testCumulativeProbability(tc.name, tc.weights); + + m_logger->log("CumulativeProbabilitySampler tests (TRACKING / YOLO / EYTZINGER):", system::ILogger::ELL_INFO); + for (const auto& tc : cases) + pass &= testSamplers(tc.name, tc.weights); + + return pass; + } + + private: + struct TestCase + { + const char* name; + std::vector weights; + }; + + static std::vector createTestCases() + { + std::vector cases; + cases.push_back({"Uniform(4)", {1.0f, 1.0f, 1.0f, 1.0f}}); + cases.push_back({"NonUniform(1,2,3,4)", {1.0f, 2.0f, 3.0f, 4.0f}}); + + { + std::vector w(32, 1.0f); + w[31] = 97.0f; + cases.push_back({"SingleDominant(32)", std::move(w)}); + } + { + std::vector w(64); + for (uint32_t i = 0; i < 64; i++) + w[i] = 1.0f / float(i + 1); + cases.push_back({"PowerLaw(64)", std::move(w)}); + } + + cases.push_back({"SingleNonZero(4)", {0.0f, 0.0f, 5.0f, 0.0f}}); + + { + std::vector w(1024); + std::mt19937 rng(42); + std::uniform_real_distribution dist(0.001f, 100.0f); + for (uint32_t i = 0; i < 1024; i++) + w[i] = dist(rng); + cases.push_back({"Random(1024)", std::move(w)}); + } + + // NPoT cases exercise EYTZINGER padded-leaf territory (P > N). 
+ cases.push_back({"NonPot(7)", {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}}); + { + std::vector w(1000); + std::mt19937 rng(4242); + std::uniform_real_distribution dist(0.001f, 100.0f); + for (uint32_t i = 0; i < 1000; i++) + w[i] = dist(rng); + cases.push_back({"Random(1000)", std::move(w)}); + } + + return cases; + } + + // Verify all values in array are in [0, 1] + bool verifyRange01(const char* prefix, const char* name, const char* arrayName, const float* data, uint32_t count) const + { + bool pass = true; + for (uint32_t i = 0; i < count; i++) + { + if (data[i] < 0.0f || data[i] > 1.0f + 1e-6f) + { + m_logger->log("%s[%s] %s[%u] = %f out of range [0, 1]", + system::ILogger::ELL_ERROR, prefix, name, arrayName, i, data[i]); + pass = false; + } + } + return pass; + } + + // Shared: verify PDFs sum to 1 and each matches weight/totalWeight + bool verifyPdf(const char* prefix, const char* name, const float* pdf, const std::vector& weights) const + { + const uint32_t N = static_cast(weights.size()); + float totalWeight = 0.0f; + for (uint32_t i = 0; i < N; i++) + totalWeight += weights[i]; + + bool pass = true; + + float pdfSum = 0.0f; + for (uint32_t i = 0; i < N; i++) + pdfSum += pdf[i]; + + if (std::abs(pdfSum - 1.0f) > 1e-5f) + { + m_logger->log("%s[%s] PDF sum: expected 1.0, got %f", system::ILogger::ELL_ERROR, prefix, name, pdfSum); + pass = false; + } + + for (uint32_t i = 0; i < N; i++) + { + const float expected = weights[i] / totalWeight; + const float err = std::abs(expected - pdf[i]); + if (err > 1e-6f) + { + m_logger->log("%s[%s] pdf[%u]: expected %f, got %f (err=%e)", system::ILogger::ELL_ERROR, prefix, name, i, expected, pdf[i], err); + pass = false; + } + } + + return pass; + } + + // Verify alias table builder output: + // - bucket contributions reconstruct correct scaled probabilities + // - PDFs sum to 1 and match weight/totalWeight + // - alias indices in range, probabilities in [0, 1] + // Builder transparently pads PoT N to N+1; actual table 
size comes back + // as `tableN` and is what gets compared against. + bool testAliasTable(const char* name, const std::vector& weights) const + { + const uint32_t userN = static_cast(weights.size()); + + std::vector outProbability; + std::vector outAlias; + std::vector outPdf; + const uint32_t tableN = nbl::hlsl::sampling::AliasTableBuilder::build({weights}, outProbability, outAlias, outPdf); + + // Accumulate bucket contributions over the full (possibly padded) table + std::vector dest(tableN, 0.0f); + for (uint32_t i = 0; i < tableN; i++) + { + dest[i] += outProbability[i]; + dest[outAlias[i]] += (1.0f - outProbability[i]); + } + + bool pass = true; + + float totalWeight = 0.0f; + for (uint32_t i = 0; i < userN; i++) + totalWeight += weights[i]; + + // Real buckets: expected scaled prob = weight/total * tableN + for (uint32_t i = 0; i < userN; i++) + { + const float expected = weights[i] / totalWeight * float(tableN); + const float err = std::abs(expected - dest[i]); + const float tolerance = std::max(1e-5f * float(tableN), 1e-4f); + + if (err > tolerance) + { + m_logger->log("AliasTable[%s] bucket %u: expected %f, got %f (err=%e)", + system::ILogger::ELL_ERROR, name, i, expected, dest[i], err); + pass = false; + } + } + + // Dummy bucket (only when padded): no real bucket aliases to it -> dest[userN] should be 0. 
+ if (tableN != userN && std::abs(dest[userN]) > 1e-4f) + { + m_logger->log("AliasTable[%s] dummy bucket %u has non-zero reconstructed probability %f", + system::ILogger::ELL_ERROR, name, userN, dest[userN]); + pass = false; + } + + // Alias indices in range [0, tableN) + for (uint32_t i = 0; i < tableN; i++) + { + if (outAlias[i] >= tableN) + { + m_logger->log("AliasTable[%s] alias[%u] = %u out of range [0, %u)", + system::ILogger::ELL_ERROR, name, i, outAlias[i], tableN); + pass = false; + } + } + + pass &= verifyPdf("AliasTable", name, outPdf.data(), weights); + pass &= verifyRange01("AliasTable", name, "probability", outProbability.data(), tableN); + + if (pass) + m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); + + return pass; + } + + // Verify CDF table construction: monotonicity, implicit-1.0 invariant, and + // stored entries in [0, 1]. PDF-from-CDF correctness is covered by the + // TRACKING sampler test below (same cdf[i] - cdf[i-1] derivation via + // sampler.backwardPdf), so it's not repeated here. 
+ bool testCumulativeProbability(const char* name, const std::vector& weights) const + { + const uint32_t N = static_cast(weights.size()); + + std::vector cumProb(N - 1); + + nbl::hlsl::sampling::computeNormalizedCumulativeHistogram(std::span(weights), cumProb.data()); + + bool pass = true; + + // Monotonically non-decreasing + for (uint32_t i = 1; i < N - 1; i++) + { + if (cumProb[i] < cumProb[i - 1] - 1e-7f) + { + m_logger->log("CumProb[%s] non-monotonic at %u: cumProb[%u]=%f < cumProb[%u]=%f", + system::ILogger::ELL_ERROR, name, i, i, cumProb[i], i - 1, cumProb[i - 1]); + pass = false; + } + } + + // Last stored entry should be < 1.0 (the Nth bucket is implicitly 1.0) + if (N > 1 && cumProb[N - 2] >= 1.0f + 1e-6f) + { + m_logger->log("CumProb[%s] last stored entry %f >= 1.0", system::ILogger::ELL_ERROR, name, cumProb[N - 2]); + pass = false; + } + + pass &= verifyRange01("CumProb", name, "cumProb", cumProb.data(), N - 1); + + if (pass) + m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); + + return pass; + } + + // Reference binary search over the full N-entry CDF (last entry == 1.0). + static uint32_t referenceUpperBound(const std::vector& fullCdf, float u) + { + auto it = std::upper_bound(fullCdf.begin(), fullCdf.end(), u); + return static_cast(std::distance(fullCdf.begin(), it)); + } + + // Run TRACKING, YOLO, and EYTZINGER samplers against the same reference + // distribution. Each mode is instantiated via the dual-compile sampler and + // exercised entirely on the CPU. 
+ bool testSamplers(const char* name, const std::vector& weights) const + { + const uint32_t N = static_cast(weights.size()); + if (N < 2) + return true; + + float totalWeight = 0.0f; + for (uint32_t i = 0; i < N; i++) + totalWeight += weights[i]; + const float rcpTotal = 1.0f / totalWeight; + + std::vector pdfRef(N); + std::vector fullCdf(N); + float acc = 0.0f; + for (uint32_t i = 0; i < N; i++) + { + pdfRef[i] = weights[i] * rcpTotal; + acc += pdfRef[i]; + fullCdf[i] = acc; + } + fullCdf[N - 1] = 1.0f; // pin the last entry; reference must treat it as exact + + // Storage for TRACKING / YOLO (N-1 entries, last bucket implicit at 1.0). + std::vector cdfStorage(N - 1); + nbl::hlsl::sampling::computeNormalizedCumulativeHistogram({weights}, cdfStorage.data()); + + // Storage for EYTZINGER (2*P entries, level-order implicit binary tree). + const uint32_t P = nbl::hlsl::sampling::eytzingerLeafCount(N); + std::vector treeStorage(2u * P, 0.0f); + nbl::hlsl::sampling::buildEytzinger({weights}, treeStorage.data()); + + bool pass = true; + pass &= testSamplerMode("TRACKING", name, N, pdfRef, fullCdf, cdfStorage.data()); + pass &= testSamplerMode("YOLO", name, N, pdfRef, fullCdf, cdfStorage.data()); + pass &= testSamplerMode("EYTZINGER", name, N, pdfRef, fullCdf, treeStorage.data()); + return pass; + } + + template + bool testSamplerMode(const char* modeName, const char* caseName, uint32_t N, + const std::vector& pdfRef, const std::vector& fullCdf, const float* accessorData) const + { + using Sampler = nbl::hlsl::sampling::CumulativeProbabilitySampler< + float, float, uint32_t, ReadOnlyAccessor, Mode>; + + ReadOnlyAccessor accessor {accessorData}; + Sampler sampler = Sampler::create(accessor, N); + + bool pass = true; + + // backwardPdf(v) == pdfRef[v], and the implied PDF sums to 1. 
+ float backwardSum = 0.0f; + for (uint32_t v = 0; v < N; v++) + { + const float got = sampler.backwardPdf(v); + const float expected = pdfRef[v]; + const float err = std::abs(got - expected); + const float tol = 1e-5f; + if (err > tol) + { + m_logger->log("Sampler[%s][%s] backwardPdf[%u]: expected %e, got %e (err=%e)", + system::ILogger::ELL_ERROR, modeName, caseName, v, expected, got, err); + pass = false; + } + backwardSum += got; + } + if (std::abs(backwardSum - 1.0f) > 1e-5f) + { + m_logger->log("Sampler[%s][%s] backwardPdf sum: expected 1.0, got %f", + system::ILogger::ELL_ERROR, modeName, caseName, backwardSum); + pass = false; + } + + // generate(u) lands in the correct bucket for a grid of u values, and + // generate(u, cache) produces forwardPdf matching backwardPdf(result). + std::mt19937 rng(1234u + N); + std::uniform_real_distribution udist(0.0f, std::nextafter(1.0f, 0.0f)); + constexpr uint32_t kTrials = 2048; + + for (uint32_t k = 0; k < kTrials; k++) + { + const float u = udist(rng); + const uint32_t ref = referenceUpperBound(fullCdf, u); + + const uint32_t idx = sampler.generate(u); + if (idx != ref) + { + m_logger->log("Sampler[%s][%s] generate(%.7f): expected bucket %u, got %u", + system::ILogger::ELL_ERROR, modeName, caseName, u, ref, idx); + pass = false; + continue; + } + + typename Sampler::cache_type cache; + const uint32_t idxCache = sampler.generate(u, cache); + if (idxCache != ref) + { + m_logger->log("Sampler[%s][%s] generate(u,cache)(%.7f): expected %u, got %u", + system::ILogger::ELL_ERROR, modeName, caseName, u, ref, idxCache); + pass = false; + continue; + } + + const float forwardP = sampler.forwardPdf(u, cache); + const float backwardP = sampler.backwardPdf(idxCache); + if (std::abs(forwardP - backwardP) > 1e-6f) + { + m_logger->log("Sampler[%s][%s] fwd/bwd pdf mismatch at u=%.7f bucket=%u: fwd=%e bwd=%e", + system::ILogger::ELL_ERROR, modeName, caseName, u, idxCache, forwardP, backwardP); + pass = false; + } + } + + if (pass) + 
m_logger->log(" [%-9s %s] PASSED", system::ILogger::ELL_PERFORMANCE, modeName, caseName); + return pass; + } + + system::ILogger* m_logger; }; #endif diff --git a/37_HLSLSamplingTests/tests/CLinearTester.h b/37_HLSLSamplingTests/tests/CLinearTester.h index 631151f00..394b68721 100644 --- a/37_HLSLSamplingTests/tests/CLinearTester.h +++ b/37_HLSLSamplingTests/tests/CLinearTester.h @@ -14,7 +14,7 @@ class CLinearTester final : public ITesterlog(" coeffStart=%s coeffEnd=%s", nbl::system::ILogger::ELL_ERROR, - to_string(s.linearCoeffStart).c_str(), to_string(s.linearCoeffEnd).c_str()); + to_string(s.normalizedCoeffStart).c_str(), to_string(s.normalizedCoeffEnd).c_str()); } }; @@ -140,7 +141,7 @@ struct LinearStressConfig { using nbl::system::to_string; logger->log(" coeffStart=%s coeffEnd=%s", nbl::system::ILogger::ELL_ERROR, - to_string(s.linearCoeffStart).c_str(), to_string(s.linearCoeffEnd).c_str()); + to_string(s.normalizedCoeffStart).c_str(), to_string(s.normalizedCoeffEnd).c_str()); } }; diff --git a/37_HLSLSamplingTests/tests/CPolarMappingTester.h b/37_HLSLSamplingTests/tests/CPolarMappingTester.h index f7009176b..13971e186 100644 --- a/37_HLSLSamplingTests/tests/CPolarMappingTester.h +++ b/37_HLSLSamplingTests/tests/CPolarMappingTester.h @@ -14,7 +14,7 @@ class CPolarMappingTester final : public ITester sizeDist(0.5f, 3.0f); std::uniform_real_distribution uDist(0.0f, 1.0f); - ProjectedSphericalRectangleInputValues input; - // Observer at origin, rect placed in front (negative Z) so the solid angle is valid. 
- input.observer = nbl::hlsl::float32_t3(0.0f, 0.0f, 0.0f); - const float width = sizeDist(getRandomEngine()); - const float height = sizeDist(getRandomEngine()); - input.rectOrigin = nbl::hlsl::float32_t3(0.0f, 0.0f, -2.0f); - input.right = nbl::hlsl::float32_t3(width, 0.0f, 0.0f); - input.up = nbl::hlsl::float32_t3(0.0f, height, 0.0f); - - // Build shape to use centralized corner check nbl::hlsl::shapes::CompressedSphericalRectangle compressed; - compressed.origin = input.rectOrigin; - compressed.right = input.right; - compressed.up = input.up; + nbl::hlsl::float32_t3 observer; + generateRandomRectangle(getRandomEngine(), compressed, observer); + + ProjectedSphericalRectangleInputValues input; + input.observer = observer; + input.rectOrigin = compressed.origin; + input.right = compressed.right; + input.up = compressed.up; + auto shape = nbl::hlsl::shapes::SphericalRectangle::create(compressed); // Ensure the receiver normal has positive projection onto at least one vertex, @@ -63,25 +58,25 @@ class CProjectedSphericalRectangleTester final : public ITester actual.extents.x || - actual.surfaceOffset.y < 0.0f || actual.surfaceOffset.y > actual.extents.y) + PdfCheck {"ProjectedSphericalRectangle::forwardPdf", &R::forwardPdf}); + VERIFY_JACOBIAN_OR_SKIP(pass, "ProjectedSphericalRectangle::jacobianProduct", 1.0f, actual.jacobianProduct, iteration, seed, testType, 5e-2, 5e-2); + + constexpr float boundsEps = 1e-5f; + if (actual.surfaceOffset.x < -boundsEps || actual.surfaceOffset.x > actual.extents.x + boundsEps || + actual.surfaceOffset.y < -boundsEps || actual.surfaceOffset.y > actual.extents.y + boundsEps) { pass = false; - printTestFail("ProjectedSphericalRectangle::generateSurfaceOffset (inside rect bounds)", actual.extents, actual.surfaceOffset, iteration, seed, testType, 0.0, 0.0); + printTestFail("ProjectedSphericalRectangle::generateSurfaceOffset (inside rect bounds)", actual.extents, actual.surfaceOffset, iteration, seed, testType, 0.0, boundsEps); } // 
generate must be unit length @@ -90,7 +85,7 @@ class CProjectedSphericalRectangleTester final : public ITester createProjectedRectSampler( +inline nbl::hlsl::sampling::ProjectedSphericalRectangle createProjectedRectSampler( std::mt19937& rng, nbl::hlsl::shapes::CompressedSphericalRectangle& compressed, nbl::hlsl::float32_t3& observer, @@ -121,15 +116,16 @@ inline nbl::hlsl::sampling::ProjectedSphericalRectangle cr outNormal = generateRandomUnitVector(rng); } while (!anyRectCornerAboveHorizon(shape, observer, outNormal)); - return sampling::ProjectedSphericalRectangle::create(shape, observer, outNormal, false); + return sampling::ProjectedSphericalRectangle::create(shape, observer, outNormal, false); } struct ProjectedSphericalRectanglePropertyConfig { - using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; + // UsePdfAsWeight=false so receiverNormal and projSolidAngle are populated for logSamplerInfo. + using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; static constexpr uint32_t numConfigurations = 200; - static constexpr uint32_t samplesPerConfig = 20000; + static constexpr uint32_t samplesPerConfig = 50000; static constexpr bool hasMCNormalization = true; static constexpr bool hasGridIntegration = false; static constexpr float64_t mcNormalizationRelTol = 0.08; @@ -155,23 +151,20 @@ struct ProjectedSphericalRectanglePropertyConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { using nbl::system::to_string; - logger->log(" r0=%s extents=%s solidAngle=%s rcpSolidAngle=%s rcpProjSolidAngle=%s", + logger->log(" r0=%s extents=%s solidAngle=%s projSolidAngle=%s receiverNormal=%s", nbl::system::ILogger::ELL_ERROR, to_string(s.sphrect.r0).c_str(), to_string(s.sphrect.extents).c_str(), to_string(s.sphrect.solidAngle).c_str(), - to_string(s.rcpSolidAngle).c_str(), - to_string(s.rcpProjSolidAngle).c_str()); - logger->log(" localReceiverNormal=%s receiverWasBSDF=%u", - nbl::system::ILogger::ELL_ERROR, - 
to_string(s.localReceiverNormal).c_str(), - static_cast(s.receiverWasBSDF)); + to_string(s.projSolidAngle).c_str(), + to_string(s.receiverNormal).c_str()); } }; struct ProjectedSphericalRectangleGrazingConfig { - using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; + // UsePdfAsWeight=false so receiverNormal and projSolidAngle are populated for logSamplerInfo. + using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; static constexpr uint32_t numConfigurations = 200; static constexpr uint32_t samplesPerConfig = 20000; @@ -202,17 +195,13 @@ struct ProjectedSphericalRectangleGrazingConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { using nbl::system::to_string; - logger->log(" r0=%s extents=%s solidAngle=%s rcpSolidAngle=%s rcpProjSolidAngle=%s", + logger->log(" r0=%s extents=%s solidAngle=%s projSolidAngle=%s receiverNormal=%s", nbl::system::ILogger::ELL_ERROR, to_string(s.sphrect.r0).c_str(), to_string(s.sphrect.extents).c_str(), to_string(s.sphrect.solidAngle).c_str(), - to_string(s.rcpSolidAngle).c_str(), - to_string(s.rcpProjSolidAngle).c_str()); - logger->log(" localReceiverNormal=%s receiverWasBSDF=%u", - nbl::system::ILogger::ELL_ERROR, - to_string(s.localReceiverNormal).c_str(), - static_cast(s.receiverWasBSDF)); + to_string(s.projSolidAngle).c_str(), + to_string(s.receiverNormal).c_str()); } }; diff --git a/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h b/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h index 31f85ba02..611fa1f3c 100644 --- a/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h +++ b/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h @@ -14,7 +14,7 @@ class CProjectedSphericalTriangleTester final : public ITester; + // UsePdfAsWeight=false so receiverNormal is populated for logSamplerInfo. 
+ using sampler_type = nbl::hlsl::sampling::ProjectedSphericalTriangle; static constexpr uint32_t numConfigurations = 200; static constexpr uint32_t samplesPerConfig = 20000; @@ -117,18 +120,19 @@ struct ProjectedSphericalTrianglePropertyConfig // E[1/pdf] = solidAngle * E[1/bilinearPdf] = solidAngle * 1.0 = solidAngle static float64_t expectedCodomainMeasure(const sampler_type& s) { - return 1.0 / static_cast(s.sphtri.base.rcpSolidAngle); + return 1.0 / static_cast(s.sphtri.rcpSolidAngle); } static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.sphtri.base.tri_vertices[0], s.sphtri.base.tri_vertices[1], s.sphtri.vertexC, s.receiverNormal); + logTriangleInfo(logger, s.sphtri.tri_vertices[0], s.sphtri.tri_vertices[1], s.sphtri.APlusC - s.sphtri.tri_vertices[0], s.receiverNormal); } }; struct ProjectedSphericalTriangleGrazingConfig { - using sampler_type = nbl::hlsl::sampling::ProjectedSphericalTriangle; + // UsePdfAsWeight=false so receiverNormal is populated for logSamplerInfo. 
+ using sampler_type = nbl::hlsl::sampling::ProjectedSphericalTriangle; static constexpr uint32_t numConfigurations = 200; static constexpr uint32_t samplesPerConfig = 20000; @@ -169,12 +173,12 @@ struct ProjectedSphericalTriangleGrazingConfig static float64_t expectedCodomainMeasure(const sampler_type& s) { - return 1.0 / static_cast(s.sphtri.base.rcpSolidAngle); + return 1.0 / static_cast(s.sphtri.rcpSolidAngle); } static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.sphtri.base.tri_vertices[0], s.sphtri.base.tri_vertices[1], s.sphtri.vertexC, s.receiverNormal); + logTriangleInfo(logger, s.sphtri.tri_vertices[0], s.sphtri.tri_vertices[1], s.sphtri.APlusC - s.sphtri.tri_vertices[0], s.receiverNormal); } }; diff --git a/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h b/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h index 2a6030b78..7aabc48ec 100644 --- a/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h +++ b/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h @@ -15,22 +15,22 @@ class CSphericalRectangleTester final : public ITester sizeDist(0.5f, 3.0f); std::uniform_real_distribution uDist(0.0f, 1.0f); + nbl::hlsl::shapes::CompressedSphericalRectangle compressed; + nbl::hlsl::float32_t3 observer; + generateRandomRectangle(getRandomEngine(), compressed, observer); + SphericalRectangleInputValues input; - // Observer at origin, rect placed in front (negative Z) so the solid angle is valid. 
- input.observer = nbl::hlsl::float32_t3(0.0f, 0.0f, 0.0f); - const float width = sizeDist(getRandomEngine()); - const float height = sizeDist(getRandomEngine()); - input.rectOrigin = nbl::hlsl::float32_t3(0.0f, 0.0f, -2.0f); - input.right = nbl::hlsl::float32_t3(width, 0.0f, 0.0f); - input.up = nbl::hlsl::float32_t3(0.0f, height, 0.0f); + input.observer = observer; + input.rectOrigin = compressed.origin; + input.right = compressed.right; + input.up = compressed.up; input.u = nbl::hlsl::float32_t2(uDist(getRandomEngine()), uDist(getRandomEngine())); m_inputs.push_back(input); return input; @@ -48,16 +48,25 @@ class CSphericalRectangleTester final : public ITester world == generate", actual.generated, actual.normalizedLocalToWorld, iteration, seed, testType, 5e-5, 5e-3); + // computeHitT(generated) must equal hitDist returned by generateNormalizedLocal + pass &= verifyTestValue("SphericalRectangle::computeHitT == hitDist", actual.computedHitT, actual.hitDist, iteration, seed, testType, 5e-4, 2e-2); + // generateUnnormalized direction must be parallel to generate() (cross product near zero) + { + const nbl::hlsl::float32_t3 c = nbl::hlsl::cross(actual.unnormalized, actual.generated); + pass &= verifyTestValue("SphericalRectangle::generateUnnormalized parallel to generate", c, nbl::hlsl::float32_t3(0.0f, 0.0f, 0.0f), iteration, seed, testType, 1e-3, 5e-2); + } + // |generateUnnormalized| must equal hitDist (distance to hitpoint along the unit ray) + { + const float ulen = nbl::hlsl::length(actual.unnormalized); + pass &= verifyTestValue("SphericalRectangle::|generateUnnormalized| == hitDist", ulen, actual.hitDist, iteration, seed, testType, 5e-4, 2e-2); + } + if (!pass && iteration < m_inputs.size()) logFailedInput(m_logger.get(), m_inputs[iteration]); diff --git a/37_HLSLSamplingTests/tests/CSphericalTriangleTester.h b/37_HLSLSamplingTests/tests/CSphericalTriangleTester.h index fd8a0f63e..68dd2310b 100644 --- a/37_HLSLSamplingTests/tests/CSphericalTriangleTester.h 
+++ b/37_HLSLSamplingTests/tests/CSphericalTriangleTester.h @@ -14,7 +14,7 @@ class CSphericalTriangleTester final : public ITester; + using sampler_type = nbl::hlsl::sampling::SphericalTriangle; static constexpr uint32_t numConfigurations = 500; static constexpr uint32_t samplesPerConfig = 20000; @@ -121,7 +124,7 @@ struct SphericalTrianglePropertyConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.base.tri_vertices[0], s.base.tri_vertices[1], s.vertexC); + logTriangleInfo(logger, s.tri_vertices[0], s.tri_vertices[1], s.APlusC - s.tri_vertices[0]); } }; @@ -130,7 +133,7 @@ struct SphericalTrianglePropertyConfig // These stress the C_s great-circle intersection and v-recovery in generateInverse. struct SphericalTriangleStressConfig { - using sampler_type = nbl::hlsl::sampling::SphericalTriangle; + using sampler_type = nbl::hlsl::sampling::SphericalTriangle; static constexpr uint32_t numConfigurations = 500; static constexpr uint32_t samplesPerConfig = 20000; @@ -218,7 +221,7 @@ struct SphericalTriangleStressConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.base.tri_vertices[0], s.base.tri_vertices[1], s.vertexC); + logTriangleInfo(logger, s.tri_vertices[0], s.tri_vertices[1], s.APlusC - s.tri_vertices[0]); } }; diff --git a/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h b/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h index 29994511f..4f80ecbaf 100644 --- a/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h +++ b/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h @@ -12,7 +12,7 @@ class CUniformHemisphereTester final : public ITester #include +#include + // ============================================================================ // Declarative field verification helpers // @@ -34,30 +36,126 @@ struct PdfCheck // Verify expected.*field vs actual.*field for each FieldCheck. 
// Must be called from within a method that has access to verifyTestValue. -#define VERIFY_FIELDS(pass, expected, actual, iteration, seed, testType, ...) \ - do \ - { \ - auto _checks = std::make_tuple(__VA_ARGS__); \ - std::apply([&](const auto&... c) { ((pass &= verifyTestValue(c.name, (expected).*c.field, (actual).*c.field, \ - iteration, seed, testType, c.relTol, c.absTol)), \ +#define VERIFY_FIELDS(pass, expected, actual, iteration, seed, testType, ...) \ + do \ + { \ + auto _checks = std::make_tuple(__VA_ARGS__); \ + std::apply([&](const auto&... c) { ((pass &= verifyTestValue(c.name, (expected).*c.field, (actual).*c.field, \ + iteration, seed, testType, c.relTol, c.absTol)), \ ...); }, _checks); \ } while (0) +// ============================================================================ +// Jacobian skip tracking +// +// The device-side sampler writes a reason-encoded skip sentinel (see +// jacobian_test.hlsl) instead of a jacobianProduct value when it cannot test +// a sample honestly. The host recognizes the sentinel, bins it by reason, +// and NEVER counts it as a pass. After all tests run, logJacobianSkipCounts() +// reports per-reason counts so nothing silently inflates pass rates. 
+// ============================================================================ + +namespace detail +{ +struct JacobianStats +{ + uint64_t total = 0; // total VERIFY_JACOBIAN_OR_SKIP invocations (= samples evaluated) + uint64_t skipUDomain = 0; // JACOBIAN_SKIP_U_DOMAIN = -1.0f + uint64_t skipCrease = 0; // JACOBIAN_SKIP_CREASE = -2.0f + uint64_t skipHemiBoundary = 0; // JACOBIAN_SKIP_HEMI_BOUNDARY = -3.0f + uint64_t skipBwdPdfRange = 0; // JACOBIAN_SKIP_BWD_PDF_RANGE = -4.0f + uint64_t skipCodomainSingularity = 0; // JACOBIAN_SKIP_CODOMAIN_SINGULARITY = -5.0f +}; + +inline nbl::core::map& jacobianStats() +{ + static nbl::core::map s; + return s; +} +} // namespace detail + +inline void logJacobianSkipCounts(nbl::system::ILogger* logger) +{ + auto& stats = detail::jacobianStats(); + if (stats.empty()) + return; + logger->log("Jacobian skip summary (skipped samples are NOT counted as passes):", nbl::system::ILogger::ELL_INFO); + for (const auto& [name, s] : stats) + { + const uint64_t skipped = s.skipUDomain + s.skipCrease + s.skipHemiBoundary + s.skipBwdPdfRange + s.skipCodomainSingularity; + if (skipped == 0) + continue; + const double percentage = s.total ? (100.0 * double(skipped) / double(s.total)) : 0.0; + logger->log(" [JacobianSkip] %s: %llu / %llu skipped (%.2f%%) -- u-domain=%llu, crease=%llu, hemi-boundary=%llu, bwd-pdf-range=%llu, codomain-singularity=%llu", + nbl::system::ILogger::ELL_WARNING, + name.c_str(), + skipped, + s.total, + percentage, + s.skipUDomain, + s.skipCrease, + s.skipHemiBoundary, + s.skipBwdPdfRange, + s.skipCodomainSingularity); + } +} + +// Verify a jacobianProduct value OR bin it by reason if it is a skip sentinel (< 0). +// Skipped samples are counted by reason and NEVER counted as a pass. +// Must be called from a method that has access to verifyTestValue. 
+#define VERIFY_JACOBIAN_OR_SKIP(pass, name, expected, actual, iteration, seed, testType, relTol, absTol) \ + do \ + { \ + auto& _jstats = detail::jacobianStats()[(name)]; \ + ++_jstats.total; \ + const float _jval = (actual); \ + if (_jval < 0.0f) \ + { \ + /* Sentinel values are integers at -1..-5, so round-to-nearest on _jval picks the bin. */ \ + const int _bin = static_cast(-_jval + 0.5f); \ + switch (_bin) \ + { \ + case 1: \ + ++_jstats.skipUDomain; \ + break; \ + case 2: \ + ++_jstats.skipCrease; \ + break; \ + case 3: \ + ++_jstats.skipHemiBoundary; \ + break; \ + case 4: \ + ++_jstats.skipBwdPdfRange; \ + break; \ + case 5: \ + ++_jstats.skipCodomainSingularity; \ + break; \ + default: \ + ++_jstats.skipUDomain; \ + break; /* fall-through bucket */ \ + } \ + } \ + else \ + { \ + pass &= verifyTestValue((name), (expected), _jval, (iteration), (seed), (testType), (relTol), (absTol)); \ + } \ + } while (0) + // Check that each PDF field is positive and finite. // Must be called from within a method that has access to printTestFail. -#define VERIFY_PDFS_POSITIVE(pass, actual, iteration, seed, testType, ...) \ - do \ - { \ - auto _pdfChecks = std::make_tuple(__VA_ARGS__); \ - std::apply([&](const auto&... c) { (([&] { \ +#define VERIFY_PDFS_POSITIVE(pass, actual, iteration, seed, testType, ...) \ + do \ + { \ + auto _pdfChecks = std::make_tuple(__VA_ARGS__); \ + std::apply([&](const auto&... 
c) { (([&] { \ if (!((actual).*c.field > 0.0f) || !std::isfinite((actual).*c.field)) \ - { \ - pass = false; \ - printTestFail(std::string(c.name) + " (positive & finite)", \ - 1.0f, (actual).*c.field, iteration, seed, testType, 0.0, 0.0); \ - } \ - }()), \ - ...); }, _pdfChecks); \ + { \ + pass = false; \ + printTestFail(std::string(c.name) + " (positive & finite)", \ + 1.0f, (actual).*c.field, iteration, seed, testType, 0.0, 0.0); \ + } \ + }()), \ + ...); }, _pdfChecks); \ } while (0) // ============================================================================ @@ -139,7 +237,7 @@ inline float64_t gridIntegratePdf1D(const auto& sampler, uint32_t N = 100000) // 2D grid integration of backwardPdf over [0,1]^2 inline float64_t gridIntegratePdf2D(const auto& sampler, uint32_t N = 1000) { - float64_t sum = 0.0; + float64_t sum = 0.0; const float64_t cellArea = 1.0 / static_cast(N * N); for (uint32_t iy = 0; iy < N; iy++) { @@ -190,17 +288,15 @@ inline void buildTangentFrame(nbl::hlsl::float32_t3 dir, nbl::hlsl::float32_t3& // Generate a small equilateral triangle on the unit sphere around baseDir with given half-angle. // Also generates a random normal with decent projection onto the triangle. 
-inline void generateSmallTriangle(std::mt19937& rng, float halfAngle, - nbl::hlsl::float32_t3& v0, nbl::hlsl::float32_t3& v1, nbl::hlsl::float32_t3& v2, - nbl::hlsl::float32_t3& baseDir, nbl::hlsl::float32_t3& normal) +inline void generateSmallTriangle(std::mt19937& rng, float halfAngle, nbl::hlsl::float32_t3& v0, nbl::hlsl::float32_t3& v1, nbl::hlsl::float32_t3& v2, nbl::hlsl::float32_t3& baseDir, nbl::hlsl::float32_t3& normal) { using namespace nbl::hlsl; baseDir = generateRandomUnitVector(rng); float32_t3 t1, t2; buildTangentFrame(baseDir, t1, t2); - v0 = normalize(baseDir + t1 * halfAngle); - v1 = normalize(baseDir - t1 * (halfAngle * 0.5f) + t2 * (halfAngle * 0.866f)); - v2 = normalize(baseDir - t1 * (halfAngle * 0.5f) - t2 * (halfAngle * 0.866f)); + v0 = normalize(baseDir + t1 * halfAngle); + v1 = normalize(baseDir - t1 * (halfAngle * 0.5f) + t2 * (halfAngle * 0.866f)); + v2 = normalize(baseDir - t1 * (halfAngle * 0.5f) - t2 * (halfAngle * 0.866f)); normal = generateRandomUnitVector(rng); if (dot(normal, baseDir) < 0.1f) normal = normalize(normal + baseDir * 2.0f); @@ -221,10 +317,10 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 float32_t3 t1, t2; buildTangentFrame(base, t1, t2); float spread = 0.15f + angleDist(rng) * 0.2f; - v0 = normalize(base + t1 * spread); - v1 = normalize(base - t1 * spread); - float far_ = 0.8f + angleDist(rng) * 0.8f; - v2 = normalize(base * std::cos(far_) + t2 * std::sin(far_)); + v0 = normalize(base + t1 * spread); + v1 = normalize(base - t1 * spread); + float far_ = 0.8f + angleDist(rng) * 0.8f; + v2 = normalize(base * std::cos(far_) + t2 * std::sin(far_)); break; } case 1: // Nearly coplanar @@ -233,12 +329,12 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 float32_t3 t1, t2; buildTangentFrame(pole, t1, t2); float offset = 0.05f + angleDist(rng) * 0.1f; - float a1 = angleDist(rng) * 6.2832f; - float a2 = a1 + 0.8f + angleDist(rng); - float a3 = a2 + 0.8f + 
angleDist(rng); - v0 = normalize(t1 * std::cos(a1) + t2 * std::sin(a1) + pole * offset); - v1 = normalize(t1 * std::cos(a2) + t2 * std::sin(a2) - pole * offset * 0.5f); - v2 = normalize(t1 * std::cos(a3) + t2 * std::sin(a3) + pole * offset * 0.3f); + float a1 = angleDist(rng) * 6.2832f; + float a2 = a1 + 0.8f + angleDist(rng); + float a3 = a2 + 0.8f + angleDist(rng); + v0 = normalize(t1 * std::cos(a1) + t2 * std::sin(a1) + pole * offset); + v1 = normalize(t1 * std::cos(a2) + t2 * std::sin(a2) - pole * offset * 0.5f); + v2 = normalize(t1 * std::cos(a3) + t2 * std::sin(a3) + pole * offset * 0.3f); break; } default: // One short edge @@ -247,9 +343,9 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 float32_t3 t1, t2; buildTangentFrame(base, t1, t2); float shortAngle = 0.32f + angleDist(rng) * 0.1f; - v0 = normalize(base + t1 * shortAngle * 0.5f); - v1 = normalize(base - t1 * shortAngle * 0.5f); - v2 = normalize(t2 + base * (0.3f + angleDist(rng) * 0.5f)); + v0 = normalize(base + t1 * shortAngle * 0.5f); + v1 = normalize(base - t1 * shortAngle * 0.5f); + v2 = normalize(t2 + base * (0.3f + angleDist(rng) * 0.5f)); break; } } @@ -262,65 +358,114 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 inline void makeEquilateralTriangle(float64_t theta, nbl::hlsl::float32_t3 verts[3]) { using namespace nbl::hlsl; - const float32_t st = static_cast(std::sin(theta)); - const float32_t ct = static_cast(std::cos(theta)); + const float32_t st = static_cast(std::sin(theta)); + const float32_t ct = static_cast(std::cos(theta)); constexpr float64_t twoPiOver3 = 2.0 * numbers::pi / 3.0; - verts[0] = float32_t3(st, 0.0f, ct); - verts[1] = float32_t3(static_cast(st * std::cos(twoPiOver3)), + verts[0] = float32_t3(st, 0.0f, ct); + verts[1] = float32_t3(static_cast(st * std::cos(twoPiOver3)), static_cast(st * std::sin(twoPiOver3)), ct); - verts[2] = float32_t3(static_cast(st * std::cos(2.0 * twoPiOver3)), + verts[2] = 
float32_t3(static_cast(st * std::cos(2.0 * twoPiOver3)), static_cast(st * std::sin(2.0 * twoPiOver3)), ct); } -// Monte Carlo estimate of projected solid angle: E[abs(dot(L, normal))] * solidAngle. -// Uses abs() to match the BSDF projected solid angle formula (which uses abs so that -// triangles straddling the horizon contribute positively from both hemispheres). -// Samples L uniformly from the spherical triangle. -inline float64_t mcEstimatePSA(const nbl::hlsl::shapes::SphericalTriangle& shape, nbl::hlsl::float32_t3 normal, uint32_t N, std::mt19937& rng) +// Grid estimate of projected solid angle: mean of abs(dot(L, normal)) over a regular +// [0,1]^2 grid, times solidAngle. Uses abs() to match the BSDF projected solid angle +// formula (triangles/rects straddling the horizon contribute from both hemispheres). +// `N` is the total number of samples; the grid side is ceil(sqrt(N)). Grid integration +// is deterministic and has much lower variance than MC at the same sample count, +// so it's a tighter ground truth for PSA-vs-formula comparisons. 
+inline float64_t gridEstimatePSA(const nbl::hlsl::shapes::SphericalTriangle& shape, nbl::hlsl::float32_t3 normal, uint32_t N) { using namespace nbl::hlsl; - auto sampler = sampling::SphericalTriangle::create(shape); - std::uniform_real_distribution uDist(0.0f, 1.0f); - float64_t sum = 0.0; - for (uint32_t i = 0; i < N; i++) + auto sampler = sampling::SphericalTriangle::create(shape); + const uint32_t gridSide = static_cast(std::ceil(std::sqrt(static_cast(N)))); + const float invSide = 1.0f / static_cast(gridSide); + float64_t sum = 0.0; + for (uint32_t iy = 0; iy < gridSide; iy++) + { + const float uy = (static_cast(iy) + 0.5f) * invSide; + for (uint32_t ix = 0; ix < gridSide; ix++) + { + const float ux = (static_cast(ix) + 0.5f) * invSide; + typename sampling::SphericalTriangle::cache_type cache; + const float32_t3 L = sampler.generate(float32_t2(ux, uy), cache); + sum += static_cast(hlsl::abs(dot(normal, L))); + } + } + return sum / static_cast(gridSide * gridSide) * static_cast(shape.solid_angle); +} + +// Sampler-independent PSA reference for rectangles. Integrates the projected-solid-angle integral +// PSA = integral over rect surface of |cos(theta_receiver)| * |cos(theta_rect)| / d^2 dA +// on a uniform surface grid in (s, t) in [0, extents.x] x [0, extents.y]. No sampler involved, +// so disagreement with a sampler-derived PSA isolates the sampler / formula. 
+inline float64_t surfaceGridEstimatePSA( + const nbl::hlsl::shapes::SphericalRectangle& shape, + const nbl::hlsl::float32_t3& observer, + const nbl::hlsl::float32_t3& normal, + uint32_t N) +{ + using namespace nbl::hlsl; + const float32_t3 rdir = shape.basis[0]; + const float32_t3 udir = shape.basis[1]; + const float32_t3 rectNormal = shape.basis[2]; + const float32_t width = shape.extents.x; + const float32_t height = shape.extents.y; + const uint32_t gridSide = static_cast(std::ceil(std::sqrt(static_cast(N)))); + const float64_t cellArea = static_cast(width) * static_cast(height) / static_cast(gridSide * gridSide); + float64_t sum = 0.0; + for (uint32_t iy = 0; iy < gridSide; iy++) { - float32_t2 u(uDist(rng), uDist(rng)); - typename sampling::SphericalTriangle::cache_type cache; - float32_t3 L = sampler.generate(u, cache); - sum += static_cast(hlsl::abs(dot(normal, L))); + const float32_t t = (static_cast(iy) + 0.5f) * height / static_cast(gridSide); + for (uint32_t ix = 0; ix < gridSide; ix++) + { + const float32_t s = (static_cast(ix) + 0.5f) * width / static_cast(gridSide); + const float32_t3 worldPt = shape.origin + rdir * s + udir * t; + const float32_t3 toSurf = worldPt - observer; + const float64_t d2 = static_cast(dot(toSurf, toSurf)); + const float64_t d = std::sqrt(d2); + const float32_t3 L = toSurf * static_cast(1.0 / d); + const float64_t cosRx = static_cast(hlsl::abs(dot(normal, L))); + const float64_t cosRt = static_cast(hlsl::abs(dot(rectNormal, L))); + sum += cosRx * cosRt / d2; + } } - return sum / static_cast(N) * static_cast(shape.solid_angle); + return sum * cellArea; } -// Monte Carlo estimate of projected solid angle for a rectangle: E[abs(dot(L, normal))] * solidAngle. -// Uses abs() to match the BSDF projected solid angle formula. -// Samples uniformly from the spherical rectangle, reconstructs world-space direction. 
-inline float64_t mcEstimatePSA( +// Grid estimate of projected solid angle for a rectangle: mean of abs(dot(L, normal)) +// over a regular [0,1]^2 grid, times solidAngle. See the triangle overload above. +inline float64_t gridEstimatePSA( const nbl::hlsl::shapes::SphericalRectangle& shape, const nbl::hlsl::float32_t3& observer, const nbl::hlsl::float32_t3& normal, - uint32_t N, std::mt19937& rng) + uint32_t N) { using namespace nbl::hlsl; auto sampler = sampling::SphericalRectangle::create(shape, observer); if (sampler.solidAngle <= 0.0f || !std::isfinite(sampler.solidAngle)) return 0.0; - std::uniform_real_distribution uDist(0.0f, 1.0f); - float64_t sum = 0.0; - for (uint32_t i = 0; i < N; i++) + const uint32_t gridSide = static_cast(std::ceil(std::sqrt(static_cast(N)))); + const float invSide = 1.0f / static_cast(gridSide); + float64_t sum = 0.0; + for (uint32_t iy = 0; iy < gridSide; iy++) { - float32_t2 u(uDist(rng), uDist(rng)); - typename sampling::SphericalRectangle::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); - // Reconstruct world-space direction from rectangle offset - float32_t3 worldPt = shape.origin - + shape.basis[0] * gen.x - + shape.basis[1] * gen.y; - float32_t3 L = normalize(worldPt - observer); - sum += static_cast(hlsl::abs(dot(normal, L))); + const float uy = (static_cast(iy) + 0.5f) * invSide; + for (uint32_t ix = 0; ix < gridSide; ix++) + { + const float ux = (static_cast(ix) + 0.5f) * invSide; + typename sampling::SphericalRectangle::cache_type cache; + // `generateLocalBasisXY` returns absolute (xu, yv) on the rectangle surface; subtract r0.xy + // to get the offset-from-r0 that the world-space reconstruction below expects. 
+ const float32_t2 absXY = sampler.generateLocalBasisXY(float32_t2(ux, uy), cache); + const float32_t2 gen = absXY - float32_t2(sampler.r0.x, sampler.r0.y); + const float32_t3 worldPt = shape.origin + shape.basis[0] * gen.x + shape.basis[1] * gen.y; + const float32_t3 L = normalize(worldPt - observer); + sum += static_cast(hlsl::abs(dot(normal, L))); + } } - return sum / static_cast(N) * static_cast(sampler.solidAngle); + return sum / static_cast(gridSide * gridSide) * static_cast(sampler.solidAngle); } // Bundles seed + rng + failCount for randomized property tests. @@ -332,7 +477,7 @@ struct SeededTestContext std::mt19937 rng; uint32_t failCount = 0; - SeededTestContext() : seed(std::random_device {}()), rng(seed) {} + SeededTestContext(std::optional seedOverride = {}) : seed(seedOverride.value_or(std::random_device {}())), rng(seed) {} // Log "reproduce with seed" if failCount > 0, return failCount == 0 bool finalize(nbl::system::ILogger* logger, const char* tag) const @@ -357,14 +502,18 @@ struct SeededTestContext } }; -// Generic PSA vs MC comparison. -// ConfigGen: void(std::mt19937& rng, uint32_t index, float64_t& formulaPSA, float64_t& mcPSA, InfoLogger& info) -// Must set formulaPSA and mcPSA for config `index`, or set both to 0 to skip. +// Generic PSA vs grid-integration comparison. +// ConfigGen: void(std::mt19937& rng, uint32_t index, float64_t& formulaPSA, float64_t& gridPSA, InfoLogger& info) +// Must set formulaPSA and gridPSA for config `index`, or set both to 0 to skip. // `info` is a callable: void(nbl::system::ILogger*, nbl::system::ILogger::E_LOG_LEVEL) that logs // sampler/shape details for the current config. Called on mismatch. -// When diagnostic=true, failures log at ELL_WARNING instead of ELL_ERROR (non-hard-fail). +// Two-tier tolerance: +// - (relTol, absTol): soft threshold. Exceedance counts as a mismatch. With diagnostic=true +// the run still returns true (known-limitation noise); with diagnostic=false it hard-fails. 
+// - (hardRelTol, hardAbsTol): egregious threshold. Always hard-fails regardless of diagnostic, +// so a catastrophic regression can't hide inside the warning stream. template -inline bool testPSAVersusMonteCarlo( +inline bool testPSAVersusGrid( nbl::system::ILogger* logger, const char* tag, const char* label, @@ -372,49 +521,78 @@ inline bool testPSAVersusMonteCarlo( uint32_t numConfigs, float64_t relTol, float64_t absTol, + float64_t hardRelTol, + float64_t hardAbsTol, bool diagnostic = false) { - const auto failLevel = diagnostic ? nbl::system::ILogger::ELL_WARNING : nbl::system::ILogger::ELL_ERROR; + const auto softFailLevel = diagnostic ? nbl::system::ILogger::ELL_WARNING : nbl::system::ILogger::ELL_ERROR; SeededTestContext ctx; + uint32_t hardFailCount = 0; + uint32_t testedCount = 0; for (uint32_t c = 0; c < numConfigs; c++) { - float64_t formulaPSA = 0.0, mcPSA = 0.0; + float64_t formulaPSA = 0.0, gridPSA = 0.0; std::function logInfo = - [](nbl::system::ILogger*, nbl::system::ILogger::E_LOG_LEVEL) {}; - configGenerator(ctx.rng, c, formulaPSA, mcPSA, logInfo); + [](nbl::system::ILogger*, nbl::system::ILogger::E_LOG_LEVEL) { + }; + configGenerator(ctx.rng, c, formulaPSA, gridPSA, logInfo); - if (mcPSA == 0.0 && formulaPSA == 0.0) + if (gridPSA == 0.0 && formulaPSA == 0.0) continue; + testedCount++; - const float64_t absErr = std::abs(formulaPSA - mcPSA); - const float64_t relErr = (std::abs(mcPSA) > 1e-10) ? absErr / std::abs(mcPSA) : 0.0; + const float64_t absErr = std::abs(formulaPSA - gridPSA); + const float64_t relErr = (std::abs(gridPSA) > 1e-10) ? 
absErr / std::abs(gridPSA) : 0.0; - if (relErr > relTol && absErr > absTol) + const bool softFail = relErr > relTol && absErr > absTol; + const bool hardFail = relErr > hardRelTol && absErr > hardAbsTol; + + if (softFail) { ctx.failCount++; + if (hardFail) + hardFailCount++; if (ctx.failCount <= 5) { - logger->log(" [%s] %s mismatch: formula=%f expected(MC)=%f relErr=%e absErr=%e config %u", - failLevel, tag, label, formulaPSA, mcPSA, relErr, absErr, c); - logInfo(logger, failLevel); + const auto level = hardFail ? nbl::system::ILogger::ELL_ERROR : softFailLevel; + logger->log(" [%s] %s %s: formula=%f expected(grid)=%f relErr=%e absErr=%e config %u", + level, tag, label, hardFail ? "HARD mismatch" : "mismatch", + formulaPSA, gridPSA, relErr, absErr, c); + logInfo(logger, level); } } } + const uint32_t skippedCount = numConfigs - testedCount; + if (ctx.failCount == 0) - logger->log(" [%s] %s PASSED (%u configs, relTol=%e absTol=%e)", - nbl::system::ILogger::ELL_PERFORMANCE, tag, label, numConfigs, relTol, absTol); - else { - logger->log(" [%s] %s FAILED (%u/%u configs exceeded tolerance, relTol=%e absTol=%e)", - failLevel, tag, label, ctx.failCount, numConfigs, relTol, absTol); - if (diagnostic) - logger->log(" [%s] reproduce with seed=%u (diagnostic only, not a hard failure)", - nbl::system::ILogger::ELL_WARNING, tag, ctx.seed); + logger->log(" [%s] %s PASSED (%u tested, %u skipped of %u requested, relTol=%e absTol=%e)", + nbl::system::ILogger::ELL_PERFORMANCE, tag, label, + testedCount, skippedCount, numConfigs, relTol, absTol); + return true; } - return diagnostic ? true : ctx.finalize(logger, tag); + const bool hardFailed = hardFailCount > 0; + const auto summaryLevel = hardFailed ? 
nbl::system::ILogger::ELL_ERROR : softFailLevel; + if (hardFailed) + logger->log(" [%s] %s FAILED (%u/%u exceeded soft tol, %u/%u exceeded HARD tol, %u skipped of %u, hardRelTol=%e hardAbsTol=%e)", + summaryLevel, tag, label, ctx.failCount, testedCount, hardFailCount, testedCount, + skippedCount, numConfigs, hardRelTol, hardAbsTol); + else + logger->log(" [%s] %s FAILED (%u/%u configs exceeded tolerance, %u skipped of %u, relTol=%e absTol=%e)", + summaryLevel, tag, label, ctx.failCount, testedCount, skippedCount, numConfigs, relTol, absTol); + + const bool shouldHardFail = hardFailed || !diagnostic; + if (shouldHardFail) + logger->log(" [%s] reproduce with seed=%u", + nbl::system::ILogger::ELL_ERROR, tag, ctx.seed); + else + logger->log(" [%s] reproduce with seed=%u (diagnostic only, not a hard failure)", + nbl::system::ILogger::ELL_WARNING, tag, ctx.seed); + + return !shouldHardFail; } // ============================================================================ @@ -435,23 +613,21 @@ inline void generateRandomRectangle(std::mt19937& rng, float32_t3 t1, t2; buildTangentFrame(normal, t1, t2); - const float width = sizeDist(rng); + const float width = sizeDist(rng); const float height = sizeDist(rng); - const float dist = distDist(rng); + const float dist = distDist(rng); - observer = float32_t3(offsetDist(rng), offsetDist(rng), offsetDist(rng)); + observer = float32_t3(offsetDist(rng), offsetDist(rng), offsetDist(rng)); compressed.origin = observer - normal * dist + t1 * offsetDist(rng) + t2 * offsetDist(rng); - compressed.right = t1 * width; - compressed.up = t2 * height; + compressed.right = t1 * width; + compressed.up = t2 * height; } // Stress rectangles: ill-conditioned geometries that exercise edge cases. 
// - Extreme aspect ratio (10:1 to 20:1) // - Grazing angle (observer nearly in the rectangle plane) // - Observer near corner (most of the rectangle off to one side) -inline void generateStressRectangle(std::mt19937& rng, - nbl::hlsl::shapes::CompressedSphericalRectangle& compressed, - nbl::hlsl::float32_t3& observer) +inline void generateStressRectangle(std::mt19937& rng, nbl::hlsl::shapes::CompressedSphericalRectangle& compressed, nbl::hlsl::float32_t3& observer) { using namespace nbl::hlsl; std::uniform_real_distribution uDist(0.0f, 1.0f); @@ -464,39 +640,39 @@ inline void generateStressRectangle(std::mt19937& rng, switch (caseDist(rng)) { case 0: // Extreme aspect ratio - { - const float longSide = 3.0f + uDist(rng) * 5.0f; - const float shortSide = 0.1f + uDist(rng) * 0.2f; - const float dist = 1.5f + uDist(rng) * 2.0f; - observer = float32_t3(0.0f, 0.0f, 0.0f); - compressed.origin = -normal * dist - t1 * (longSide * 0.5f) - t2 * (shortSide * 0.5f); - compressed.right = t1 * longSide; - compressed.up = t2 * shortSide; - break; - } + { + const float longSide = 3.0f + uDist(rng) * 5.0f; + const float shortSide = 0.1f + uDist(rng) * 0.2f; + const float dist = 1.5f + uDist(rng) * 2.0f; + observer = float32_t3(0.0f, 0.0f, 0.0f); + compressed.origin = -normal * dist - t1 * (longSide * 0.5f) - t2 * (shortSide * 0.5f); + compressed.right = t1 * longSide; + compressed.up = t2 * shortSide; + break; + } case 1: // Grazing angle (observer nearly in the rectangle plane) - { - const float width = 1.0f + uDist(rng) * 2.0f; - const float height = 1.0f + uDist(rng) * 2.0f; - const float normalDist = 0.05f + uDist(rng) * 0.15f; - const float tangentOffset = 0.5f + uDist(rng) * 1.0f; - observer = float32_t3(0.0f, 0.0f, 0.0f); - compressed.origin = -normal * normalDist + t1 * tangentOffset - t2 * (height * 0.5f); - compressed.right = t1 * width; - compressed.up = t2 * height; - break; - } + { + const float width = 1.0f + uDist(rng) * 2.0f; + const float height = 1.0f + 
uDist(rng) * 2.0f; + const float normalDist = 0.05f + uDist(rng) * 0.15f; + const float tangentOffset = 0.5f + uDist(rng) * 1.0f; + observer = float32_t3(0.0f, 0.0f, 0.0f); + compressed.origin = -normal * normalDist + t1 * tangentOffset - t2 * (height * 0.5f); + compressed.right = t1 * width; + compressed.up = t2 * height; + break; + } default: // Observer near corner - { - const float width = 2.0f + uDist(rng) * 3.0f; - const float height = 2.0f + uDist(rng) * 3.0f; - const float dist = 0.5f + uDist(rng) * 1.0f; - observer = float32_t3(0.0f, 0.0f, 0.0f); - compressed.origin = -normal * dist - t1 * (0.05f + uDist(rng) * 0.1f) - t2 * (0.05f + uDist(rng) * 0.1f); - compressed.right = t1 * width; - compressed.up = t2 * height; - break; - } + { + const float width = 2.0f + uDist(rng) * 3.0f; + const float height = 2.0f + uDist(rng) * 3.0f; + const float dist = 0.5f + uDist(rng) * 1.0f; + observer = float32_t3(0.0f, 0.0f, 0.0f); + compressed.origin = -normal * dist - t1 * (0.05f + uDist(rng) * 0.1f) - t2 * (0.05f + uDist(rng) * 0.1f); + compressed.right = t1 * width; + compressed.up = t2 * height; + break; + } } } @@ -590,10 +766,10 @@ inline void logRectInfo( { using namespace nbl::system; using namespace nbl::hlsl; - const float width = length(compressed.right); - const float height = length(compressed.up); + const float width = length(compressed.right); + const float height = length(compressed.up); const float32_t3 normal = normalize(cross(compressed.right, compressed.up)); - const float dist = length(compressed.origin - observer); + const float dist = length(compressed.origin - observer); logger->log(" origin=%s right=%s up=%s observer=%s", ILogger::ELL_ERROR, to_string(compressed.origin).c_str(), @@ -617,14 +793,14 @@ inline bool anyRectCornerAboveHorizon( const nbl::hlsl::float32_t3& normal) { using namespace nbl::hlsl; - const float32_t3 r0 = mul(shape.basis, shape.origin - observer); + const float32_t3 r0 = mul(shape.basis, shape.origin - observer); const 
float32_t3 localN = mul(shape.basis, normal); - const float32_t3 v0 = normalize(r0); - const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); - const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); - const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); + const float32_t3 v0 = normalize(r0); + const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); + const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); + const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); return dot(localN, v0) > 0.0f || dot(localN, v1) > 0.0f || - dot(localN, v2) > 0.0f || dot(localN, v3) > 0.0f; + dot(localN, v2) > 0.0f || dot(localN, v3) > 0.0f; } // True if all rectangle corners have positive NdotL with the given normal. @@ -635,14 +811,14 @@ inline bool allRectCornersAboveHorizon( const nbl::hlsl::float32_t3& normal) { using namespace nbl::hlsl; - const float32_t3 r0 = mul(shape.basis, shape.origin - observer); + const float32_t3 r0 = mul(shape.basis, shape.origin - observer); const float32_t3 localN = mul(shape.basis, normal); - const float32_t3 v0 = normalize(r0); - const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); - const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); - const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); + const float32_t3 v0 = normalize(r0); + const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); + const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); + const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); return dot(localN, v0) > 0.0f && dot(localN, v1) > 0.0f && - dot(localN, v2) > 0.0f && dot(localN, v3) > 0.0f; + dot(localN, v2) > 0.0f && dot(localN, v3) > 0.0f; } #endif diff --git 
a/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h b/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h index cb28b63fc..b20ba88f9 100644 --- a/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h +++ b/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h @@ -78,7 +78,9 @@ class CSamplerPropertyTester } public: - CSamplerPropertyTester(system::ILogger* logger) : m_logger(logger) {} + CSamplerPropertyTester(system::ILogger* logger, std::optional seedOverride = {}) : m_logger(logger), m_seedOverride(seedOverride) {} + + std::optional failureSeed() const { return m_failureSeed; } bool run() { @@ -96,7 +98,7 @@ class CSamplerPropertyTester // If the PDF normalization is wrong by factor k, this will be off by 1/k. bool testMonteCarloPdfNormalization() { - SeededTestContext ctx; + SeededTestContext ctx(m_seedOverride); uint32_t evaluatedConfigs = 0; for (uint32_t c = 0; c < Config::numConfigurations; c++) @@ -159,7 +161,10 @@ class CSamplerPropertyTester m_logger->log(" [%s] MC normalization FAILED (%u/%u evaluated configs failed, %u/%u configs evaluated, %u samples/config, relTol=%e)", system::ILogger::ELL_ERROR, Config::name(), ctx.failCount, evaluatedConfigs, evaluatedConfigs, Config::numConfigurations, Config::samplesPerConfig, Config::mcNormalizationRelTol); - return ctx.finalize(m_logger, Config::name()); + const bool passed = ctx.finalize(m_logger, Config::name()); + if (!passed) + m_failureSeed = ctx.seed; + return passed; } // Test 4: Grid integration of backwardPdf over [0,1]^d codomain @@ -167,7 +172,7 @@ class CSamplerPropertyTester // integral of backwardPdf over codomain should equal 1.0. 
bool testGridPdfNormalization() { - SeededTestContext ctx; + SeededTestContext ctx(m_seedOverride); for (uint32_t c = 0; c < Config::numConfigurations; c++) { @@ -191,10 +196,15 @@ class CSamplerPropertyTester m_logger->log(" [%s] grid PDF normalization FAILED (%u/%u configs exceeded absTol=%e)", system::ILogger::ELL_ERROR, Config::name(), ctx.failCount, Config::numConfigurations, Config::gridNormalizationAbsTol); - return ctx.finalize(m_logger, Config::name()); + const bool passed = ctx.finalize(m_logger, Config::name()); + if (!passed) + m_failureSeed = ctx.seed; + return passed; } system::ILogger* m_logger; + std::optional m_seedOverride; + std::optional m_failureSeed; }; @@ -414,6 +424,12 @@ class CSphericalTriangleGenerateTester auto sampler = sampling::SphericalTriangle::create(shape); const float64_t SA = static_cast(shape.solid_angle); + // Float32 solid angle (acos sum - pi) loses precision for small + // triangles due to catastrophic cancellation, making the expected + // sub-solid-angle ratio unreliable as a reference value. + // At SA ~ 0.003, the relative error in float32 solid angles reaches + // ~1-3%, comparable to the half-space counting tolerance. + const bool tinyTriangle = SA < 4e-3; // For each cut: pick a vertex and a point on the opposite edge, // forming a great circle that splits the triangle in two. 
@@ -482,12 +498,20 @@ class CSphericalTriangleGenerateTester testedCuts++; if (absErr > relTol) { - ctx.failCount++; - if (ctx.failCount <= 5) + if (tinyTriangle) { - m_logger->log("[SphericalTriangle::generate] %s half-space: observed=%f expected=%f absErr=%e (tol=%e) tri %u cut %u", - system::ILogger::ELL_ERROR, label, observedFraction, expectedFraction, absErr, relTol, t, c); - logTriangleInfo(m_logger, v0, v1, v2); + m_logger->log("[SphericalTriangle::generate] %s half-space: observed=%f expected=%f absErr=%e (tol=%e) tri %u cut %u -- solid angle %e too small for float32, especially on GPU", + system::ILogger::ELL_WARNING, label, observedFraction, expectedFraction, absErr, relTol, t, c, SA); + } + else + { + ctx.failCount++; + if (ctx.failCount <= 5) + { + m_logger->log("[SphericalTriangle::generate] %s half-space: observed=%f expected=%f absErr=%e (tol=%e) tri %u cut %u", + system::ILogger::ELL_ERROR, label, observedFraction, expectedFraction, absErr, relTol, t, c); + logTriangleInfo(m_logger, v0, v1, v2); + } } } } @@ -504,12 +528,20 @@ class CSphericalTriangleGenerateTester } // ------------------------------------------------------------------------- - // Moment matching: E[dot(generate(u), N)] should equal PSA(N) / SA. + // Moment matching: E[dot(generate(u), N)] should equal signedPSA(N) / SA. // // For a uniform distribution over a spherical triangle: // E[f(L)] = (1/SA) * integral_triangle f(L) dw // - // Choosing f(L) = dot(L, N) gives E[dot(L, N)] = PSA(N) / SA. + // Choosing f(L) = dot(L, N) gives E[dot(L, N)] = signedPSA(N) / SA, + // where signedPSA is the exact signed projected solid angle computed + // via the Kelvin-Stokes theorem: + // signedPSA(N) = 0.5 * sum_edges dot(edgeNormal_i, N) * edgeArcLength_i + // + // Note: shapes::SphericalTriangle::projectedSolidAngle() returns a signed result + // (Kelvin-Stokes signed sum); tests abs() the return to compare against the + // |cos(theta)| (BSDF) PSA integral reference. 
+ // // If generate() has a systematic bias (e.g., concentrating samples // near one vertex), this moment will be wrong for most directions N. // Testing multiple random N per triangle makes it very unlikely that @@ -533,11 +565,34 @@ class CSphericalTriangleGenerateTester auto sampler = sampling::SphericalTriangle::create(shape); const float64_t SA = static_cast(shape.solid_angle); + // Precompute edge normals and arc lengths for the signed PSA formula. + // cross(v_j, v_k) * csc_sides[i] gives outward-pointing edge normals + // only when the vertices are CCW as seen from outside the sphere. + // The sign of the triple product dot(v0, cross(v1, v2)) tells us the + // winding: positive = CCW (outward normals), negative = CW (inward). + const float32_t3 crossBC = hlsl::cross(shape.vertices[1], shape.vertices[2]); + const float64_t windingSign = (hlsl::dot(shape.vertices[0], crossBC) >= 0.0f) ? 1.0 : -1.0; + const float32_t3 edgeNormals[3] = { + crossBC * shape.csc_sides[0], + hlsl::cross(shape.vertices[2], shape.vertices[0]) * shape.csc_sides[1], + hlsl::cross(shape.vertices[0], shape.vertices[1]) * shape.csc_sides[2] + }; + const float64_t edgeAngles[3] = { + std::acos(static_cast(hlsl::clamp(shape.cos_sides[0], -1.0f, 1.0f))), + std::acos(static_cast(hlsl::clamp(shape.cos_sides[1], -1.0f, 1.0f))), + std::acos(static_cast(hlsl::clamp(shape.cos_sides[2], -1.0f, 1.0f))) + }; + for (uint32_t n = 0; n < numNormals; n++) { float32_t3 N = generateRandomUnitVector(ctx.rng); - const float64_t psa = static_cast(shape.projectedSolidAngle(N)); - const float64_t expected = psa / SA; + + // Signed PSA via Kelvin-Stokes: exact for integral dot(L,N) dOmega + float64_t signedPSA = 0.0; + for (uint32_t e = 0; e < 3; e++) + signedPSA += static_cast(hlsl::dot(edgeNormals[e], N)) * edgeAngles[e]; + signedPSA *= 0.5 * windingSign; + const float64_t expected = signedPSA / SA; float64_t sum = 0.0; std::uniform_real_distribution uDist(0.0f, 1.0f); @@ -546,7 +601,7 @@ class 
CSphericalTriangleGenerateTester float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); typename sampling::SphericalTriangle::cache_type cache; float32_t3 L = sampler.generate(u, cache); - sum += static_cast(hlsl::abs(dot(L, N))); + sum += static_cast(dot(L, N)); } const float64_t mcEstimate = sum / static_cast(numSamples); @@ -601,7 +656,7 @@ class CSphericalTriangleGenerateTester if (shape.solid_angle <= 0.0f || !std::isfinite(shape.solid_angle)) continue; - auto sampler = sampling::SphericalTriangle::create(shape); + auto sampler = sampling::SphericalTriangle::create(shape); std::uniform_real_distribution uDist(0.0f, 1.0f); for (uint32_t i = 0; i < samplesPerTriangle; i++) @@ -742,7 +797,7 @@ class CSphericalTriangleGenerateTester // Tests two aspects of projected spherical triangles: // // 1. PSA formula accuracy: shapes::SphericalTriangle::projectedSolidAngle -// against Monte Carlo ground truth (PSA = integral_{tri} abs(dot(L,N)) dOmega). +// against grid-integration ground truth (PSA = integral_{tri} abs(dot(L,N)) dOmega). // // 2. PST sampler accuracy: how well ProjectedSphericalTriangle's bilinear // importance sampling approximates the true NdotL distribution, and @@ -767,18 +822,21 @@ class CProjectedSphericalTriangleGeometricTester // when edge normals have mixed signs, even when all vertices are above the horizon. // These tests are diagnostic-only until proper hemisphere clipping is implemented. // TODO: make these hard failures once projectedSolidAngle clips to the hemisphere. - testPSAVersusMonteCarlo("random MC", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) + // Hard-fail thresholds: relErr > 3.0 AND absErr > 0.3 means the formula is catastrophically + // wrong, not just affected by the known abs()-overcount limitation. Catches regressions that + // would otherwise hide in the warning stream. 
+ pass &= testPSAVersusGrid("random", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) { generateRandomTriangleVertices(rng, v0, v1, v2); - normal = generateRandomUnitVector(rng); }, 200, 500000, 0.05, 0.01, true); - testPSAVersusMonteCarlo("grazing MC", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) + normal = generateRandomUnitVector(rng); }, 200, 500000, 0.05, 0.01, 3.0, 0.3, true); + pass &= testPSAVersusGrid("grazing", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) { generateRandomTriangleVertices(rng, v0, v1, v2); float32_t3 triCenter = normalize(v0 + v1 + v2); float32_t3 tangent, unused; buildTangentFrame(triCenter, tangent, unused); std::uniform_real_distribution grazeDist(0.02f, 0.15f); - normal = normalize(tangent + triCenter * grazeDist(rng)); }, 200, 500000, 0.1, 0.01, true); + normal = normalize(tangent + triCenter * grazeDist(rng)); }, 200, 500000, 0.1, 0.01, 3.0, 0.3, true); // Also diagnostic -- same abs() issue affects small triangles testPSASmallTriangle(); @@ -860,7 +918,7 @@ class CProjectedSphericalTriangleGeometricTester // Known analytic cases bool testPSAKnownCases() { - constexpr float64_t psaOctantMCRelTol = 0.05; + constexpr float64_t psaOctantGridRelTol = 0.05; constexpr float64_t psaSymmetryRelTol = 1e-4; SeededTestContext ctx; @@ -872,51 +930,52 @@ class CProjectedSphericalTriangleGeometricTester // By Kelvin-Stokes / direct integration, PSA = pi/4 for any axis-aligned normal. 
{ auto shape = createSphericalTriangleShape(float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1)); - const float64_t psaZ = static_cast(shape.projectedSolidAngle(float32_t3(0, 0, 1))); + const float64_t psaZ = std::abs(static_cast(shape.projectedSolidAngle(float32_t3(0, 0, 1)))); - // MC verification: sample many points uniformly from the octant triangle - const float64_t mcPSA = mcEstimatePSA(shape, float32_t3(0, 0, 1), 1000000, ctx.rng); + // Grid verification: evaluate abs(N.L) over a dense grid on the octant triangle + const float64_t gridPSA = gridEstimatePSA(shape, float32_t3(0, 0, 1), 1000000); - const float64_t formulaVsMC = std::abs(psaZ - mcPSA) / std::abs(mcPSA); - m_logger->log(" [PSA] octant z-normal: formula=%f expected(pi/4)=%f reference=%f relErr=%e", - system::ILogger::ELL_PERFORMANCE, psaZ, nbl::hlsl::numbers::pi / 4.0, mcPSA, formulaVsMC); + const float64_t formulaVsGrid = std::abs(psaZ - gridPSA) / std::abs(gridPSA); + m_logger->log(" [TriPSA] octant z-normal: formula=%f expected(pi/4)=%f reference=%f relErr=%e", + system::ILogger::ELL_PERFORMANCE, psaZ, nbl::hlsl::numbers::pi / 4.0, gridPSA, formulaVsGrid); - if (formulaVsMC > psaOctantMCRelTol) + if (formulaVsGrid > psaOctantGridRelTol) { - m_logger->log(" [PSA] octant z-normal FAILED: formula=%f expected(reference)=%f relErr=%e relTol=%e", - system::ILogger::ELL_ERROR, psaZ, mcPSA, formulaVsMC, psaOctantMCRelTol); + m_logger->log(" [TriPSA] octant z-normal FAILED: formula=%f expected(reference)=%f relErr=%e relTol=%e", + system::ILogger::ELL_ERROR, psaZ, gridPSA, formulaVsGrid, psaOctantGridRelTol); pass = false; } // Same octant, normal = (1,0,0): by symmetry same result as z-normal - const float64_t psaX = static_cast(shape.projectedSolidAngle(float32_t3(1, 0, 0))); + const float64_t psaX = std::abs(static_cast(shape.projectedSolidAngle(float32_t3(1, 0, 0)))); const float64_t relDiff = std::abs(psaZ - psaX) / std::max(psaZ, psaX); - m_logger->log(" [PSA] octant symmetry: psaZ=%f 
psaX=%f relDiff=%e", + m_logger->log(" [TriPSA] octant symmetry: psaZ=%f psaX=%f relDiff=%e", system::ILogger::ELL_PERFORMANCE, psaZ, psaX, relDiff); if (relDiff > psaSymmetryRelTol) { - m_logger->log(" [PSA] octant symmetry FAILED: psaZ=%f psaX=%f relDiff=%e relTol=%e", + m_logger->log(" [TriPSA] octant symmetry FAILED: psaZ=%f psaX=%f relDiff=%e relTol=%e", system::ILogger::ELL_ERROR, psaZ, psaX, relDiff, psaSymmetryRelTol); pass = false; } } if (pass) - m_logger->log(" [PSA] known cases PASSED (octant z-normal vs MC relTol=%e, octant symmetry z vs x relTol=%e)", - system::ILogger::ELL_PERFORMANCE, psaOctantMCRelTol, psaSymmetryRelTol); + m_logger->log(" [TriPSA] known cases PASSED (octant z-normal vs grid relTol=%e, octant symmetry z vs x relTol=%e)", + system::ILogger::ELL_PERFORMANCE, psaOctantGridRelTol, psaSymmetryRelTol); - return ctx.finalize(pass, m_logger, "PSA"); + return ctx.finalize(pass, m_logger, "TriPSA"); } - // Helper: run MC comparison of formulaPSA vs E[dot(L,N)]*SA for a set of triangle configs. + // Helper: run grid-integration comparison of formulaPSA vs PSA reference for a set of triangle configs. // TriConfigGen: void(rng, index, v0, v1, v2, normal) — generates triangle vertices + normal. 
template - bool testPSAVersusMonteCarlo(const char* label, TriConfigGen triConfigGenerator, uint32_t numConfigs, uint32_t mcSamples, float64_t relTol, float64_t absTol, bool diagnostic = false) + bool testPSAVersusGrid(const char* label, TriConfigGen triConfigGenerator, uint32_t numConfigs, uint32_t gridSamples, + float64_t relTol, float64_t absTol, float64_t hardRelTol, float64_t hardAbsTol, bool diagnostic = false) { - return ::testPSAVersusMonteCarlo(m_logger, "PSA", label, - [&](std::mt19937& rng, uint32_t c, float64_t& formulaPSA, float64_t& mcPSA, auto& logInfo) + return ::testPSAVersusGrid(m_logger, "TriPSA", label, + [&](std::mt19937& rng, uint32_t c, float64_t& formulaPSA, float64_t& gridPSA, auto& logInfo) { float32_t3 v0, v1, v2, normal; triConfigGenerator(rng, c, v0, v1, v2, normal); @@ -925,8 +984,8 @@ class CProjectedSphericalTriangleGeometricTester if (shape.solid_angle <= 0.0f || !std::isfinite(shape.solid_angle)) return; - formulaPSA = static_cast(shape.projectedSolidAngle(normal)); - mcPSA = mcEstimatePSA(shape, normal, mcSamples, rng); + formulaPSA = std::abs(static_cast(shape.projectedSolidAngle(normal))); + gridPSA = gridEstimatePSA(shape, normal, gridSamples); logInfo = [=](system::ILogger* logger, system::ILogger::E_LOG_LEVEL level) { using nbl::system::to_string; @@ -935,14 +994,14 @@ class CProjectedSphericalTriangleGeometricTester to_string(normal).c_str(), to_string(shape.solid_angle).c_str()); }; }, - numConfigs, relTol, absTol, diagnostic); + numConfigs, relTol, absTol, hardRelTol, hardAbsTol, diagnostic); } - // Small triangles -- PSA should approach MC ground truth + // Small triangles -- PSA should approach grid ground truth bool testPSASmallTriangle() { constexpr float64_t smallTriMeanRelErrTol = 0.1; - constexpr uint32_t smallTriMCSamples = 100000; + constexpr uint32_t smallTriGridSamples = 100000; SeededTestContext ctx; bool pass = true; @@ -973,27 +1032,27 @@ class CProjectedSphericalTriangleGeometricTester if (shape.solid_angle 
<= 0.0f || !std::isfinite(shape.solid_angle)) continue; - const float64_t formulaPSA = static_cast(shape.projectedSolidAngle(normal)); + const float64_t formulaPSA = std::abs(static_cast(shape.projectedSolidAngle(normal))); const float64_t sa = static_cast(shape.solid_angle); const float64_t centerNdotL = static_cast(dot(normal, baseDir)); if (std::abs(centerNdotL) < 0.1 || sa < 1e-10) continue; - // MC ground truth: E[abs(dot(L, N))] * solidAngle - const float64_t mcPSA = mcEstimatePSA(shape, normal, smallTriMCSamples, ctx.rng); + // Grid ground truth: mean over regular [0,1]^2 grid of abs(dot(L, N)) * solidAngle + const float64_t gridPSA = gridEstimatePSA(shape, normal, smallTriGridSamples); - if (std::abs(mcPSA) < 1e-10) + if (std::abs(gridPSA) < 1e-10) continue; - const float64_t relErr = (formulaPSA - mcPSA) / mcPSA; + const float64_t relErr = (formulaPSA - gridPSA) / gridPSA; sumRelErrPerSize[s] += relErr; validTrials[s]++; } } - m_logger->log(" [PSA] small triangle PSA vs MC (signed relErr, positive=overestimate):", system::ILogger::ELL_PERFORMANCE); + m_logger->log(" [TriPSA] small triangle PSA vs grid (signed relErr, positive=overestimate):", system::ILogger::ELL_PERFORMANCE); for (uint32_t s = 0; s < numSizes; s++) { if (validTrials[s] > 0) @@ -1005,14 +1064,14 @@ class CProjectedSphericalTriangleGeometricTester // Skip halfAngle=0.01 (s==5): float32 solid angle precision collapses if (s == 4 && std::abs(meanRelErr) > smallTriMeanRelErrTol) { - m_logger->log(" [PSA] small triangle exceeded tolerance at halfAngle=%.3f meanRelErr=%+e meanRelErrTol=%e (%u trials)", + m_logger->log(" [TriPSA] small triangle exceeded tolerance at halfAngle=%.3f meanRelErr=%+e meanRelErrTol=%e (%u trials)", system::ILogger::ELL_WARNING, halfAngles[s], meanRelErr, smallTriMeanRelErrTol, validTrials[s]); } } } - m_logger->log(" [PSA] small triangle test complete (%u trials across %u sizes, %u MC samples each, meanRelErrTol=%e) -- diagnostic only", - 
system::ILogger::ELL_PERFORMANCE, numTrials, numSizes, smallTriMCSamples, smallTriMeanRelErrTol); + m_logger->log(" [TriPSA] small triangle test complete (%u trials across %u sizes, %u grid samples each, meanRelErrTol=%e) -- diagnostic only", + system::ILogger::ELL_PERFORMANCE, numTrials, numSizes, smallTriGridSamples, smallTriMeanRelErrTol); return true; // diagnostic only -- abs()-based PSA overestimates, not a hard failure } @@ -1076,7 +1135,7 @@ class CProjectedSphericalTriangleGeometricTester if (!std::isfinite(sampler.sphtri.rcpSolidAngle) || sampler.sphtri.rcpSolidAngle <= 0.0f) continue; - const float64_t projSA = static_cast(shape.projectedSolidAngle(cfg.normal)); + const float64_t projSA = std::abs(static_cast(shape.projectedSolidAngle(cfg.normal))); const bool hasPSA = projSA > 0.0 && std::isfinite(projSA); const float64_t rcpPSA = hasPSA ? 1.0 / projSA : 0.0; MISStats& mis = isGrazing ? grazingMIS : normalMIS; @@ -1090,7 +1149,7 @@ class CProjectedSphericalTriangleGeometricTester float32_t3 L = sampler.generate(u, cache); const float64_t trueNdotL = std::max(0.0, static_cast(dot(cfg.normal, L))); - const float64_t bilinearNdotL = static_cast(cache.abs_cos_theta); + const float64_t bilinearNdotL = std::numeric_limits::quiet_NaN(); const float64_t pstPdf = static_cast(sampler.forwardPdf(u, cache)); // Bilinear vs true NdotL @@ -1323,7 +1382,7 @@ class CProjectedSphericalTriangleGeometricTester continue; auto sampler = createSampler(cfg); - const float64_t projSA = static_cast(shape.projectedSolidAngle(cfg.normal)); + const float64_t projSA = std::abs(static_cast(shape.projectedSolidAngle(cfg.normal))); if (projSA <= 0.0 || !std::isfinite(projSA) || !std::isfinite(sampler.sphtri.rcpSolidAngle) || sampler.sphtri.rcpSolidAngle <= 0.0f) @@ -1344,7 +1403,11 @@ class CProjectedSphericalTriangleGeometricTester if (trueNdotL < 1e-6) continue; - const float64_t pstPdf = static_cast(sampler.backwardPdf(L)); + // No direct backwardPdf; evaluate forwardPdf at the 
inverted u to recover pdf(L). + const float32_t2 uInv = sampler.sphtri.generateInverse(L); + typename sampling::ProjectedSphericalTriangle::cache_type pdfCache; + sampler.generate(uInv, pdfCache); + const float64_t pstPdf = static_cast(sampler.forwardPdf(uInv, pdfCache)); const float64_t idealPdf = trueNdotL * rcpPSA; if (!std::isfinite(pstPdf) || pstPdf <= 0.0 || idealPdf <= 0.0) @@ -1416,6 +1479,15 @@ struct UniformRectSamplerPolicy return sampler_type::create(shape, observer); } + // Returns offset-from-r0 on the rectangle surface. Goes through generateLocalBasisXY + // (absolute xy) and subtracts r0.xy so the [0, extents] bounds check still applies. + static float32_t2 generateOffset(sampler_type& s, const float32_t2& u) + { + typename sampler_type::cache_type cache; + const float32_t2 absXY = s.generateLocalBasisXY(u, cache); + return absXY - float32_t2(s.r0.x, s.r0.y); + } + static float getSolidAngle(const sampler_type& s) { return s.solidAngle; } static const char* name() { return "SphericalRectangle"; } @@ -1425,7 +1497,8 @@ struct UniformRectSamplerPolicy struct ProjectedRectSamplerPolicy { - using sampler_type = sampling::ProjectedSphericalRectangle; + // UsePdfAsWeight=false so receiverNormal and projSolidAngle are populated for diagnostic logs. + using sampler_type = sampling::ProjectedSphericalRectangle; static sampler_type createSampler(shapes::SphericalRectangle& shape, const float32_t3& observer, std::mt19937& rng) @@ -1439,6 +1512,17 @@ struct ProjectedRectSamplerPolicy return sampler_type::create(shape, observer, receiverNormal, false); } + // Run u through the bilinear warp then the inner sphrect's generateLocalBasisXY, and subtract + // r0.xy to get offset-from-r0 on the rectangle surface. 
+ static float32_t2 generateOffset(sampler_type& s, const float32_t2& u) + { + typename sampling::Bilinear::cache_type bc; + const float32_t2 warped = s.bilinearPatch.generate(u, bc); + typename sampling::SphericalRectangle::cache_type sphrectCache; + const float32_t2 absXY = s.sphrect.generateLocalBasisXY(warped, sphrectCache); + return absXY - float32_t2(s.sphrect.r0.x, s.sphrect.r0.y); + } + static float getSolidAngle(const sampler_type& s) { return s.sphrect.solidAngle; } static const char* name() { return "ProjectedSphericalRectangle"; } @@ -1635,8 +1719,7 @@ class CRectangleGenerateTester for (uint32_t i = 0; i < numSamples; i++) { float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); - typename sampler_type::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); + float32_t2 gen = Policy::generateOffset(sampler, u); const float coord = cutAlongX ? gen.x : gen.y; if (coord < cutThreshold) countInSub++; @@ -1714,8 +1797,7 @@ class CRectangleGenerateTester for (uint32_t i = 0; i < numSamples; i++) { float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); - typename sampler_type::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); + float32_t2 gen = Policy::generateOffset(sampler, u); float32_t3 dir = reconstructDirection(compressed, shape.extents, observer, gen); sum += static_cast(dot(dir, N)); } @@ -1778,8 +1860,7 @@ class CRectangleGenerateTester for (uint32_t i = 0; i < numSamples; i++) { float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); - typename sampler_type::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); + float32_t2 gen = Policy::generateOffset(sampler, u); if (gen.x < -1e-5f || gen.x > extX + 1e-5f || gen.y < -1e-5f || gen.y > extY + 1e-5f) { @@ -1891,9 +1972,9 @@ using CProjectedSphericalRectangleGenerateTester = CRectangleGenerateTester 3.0 AND absErr > 0.3) still catch catastrophic regressions. 
+ bool pass = true; + pass &= testPSAVersusGrid("random", generateRandomRectangle, 200, 500000, 0.05, 0.01, 3.0, 0.3); + pass &= testPSAVersusGrid("grazing", generateStressRectangle, 200, 500000, 0.1, 0.01, 3.0, 0.3); + return pass; } private: // Reuse rectangle generators from CRectangleGenerateTester using RectGen = void(*)(std::mt19937&, shapes::CompressedSphericalRectangle&, float32_t3&); - bool testPSAVersusMonteCarlo(const char* label, RectGen rectGen, uint32_t numConfigs, uint32_t mcSamples, float64_t relTol, float64_t absTol) + bool testPSAVersusGrid(const char* label, RectGen rectGen, uint32_t numConfigs, uint32_t gridSamples, + float64_t relTol, float64_t absTol, float64_t hardRelTol, float64_t hardAbsTol) { - return ::testPSAVersusMonteCarlo(m_logger, "RectPSA", label, - [&](std::mt19937& rng, uint32_t, float64_t& formulaPSA, float64_t& mcPSA, auto& logInfo) + return ::testPSAVersusGrid(m_logger, "RectPSA", label, + [&](std::mt19937& rng, uint32_t, float64_t& formulaPSA, float64_t& gridPSA, auto& logInfo) { shapes::CompressedSphericalRectangle compressed; float32_t3 observer; @@ -1932,7 +2016,9 @@ class CProjectedSphericalRectangleGeometricTester float32_t3 normal = generateRandomUnitVector(rng); formulaPSA = static_cast(shape.projectedSolidAngle(observer, normal)); - mcPSA = mcEstimatePSA(shape, observer, normal, mcSamples, rng); + // surfaceGridEstimatePSA integrates over the rectangle surface directly (no sampler in + // the loop), so a formula-vs-reference mismatch here isolates the PSA formula. 
+ gridPSA = surfaceGridEstimatePSA(shape, observer, normal, gridSamples); logInfo = [compressed, observer, normal, saValue = sa.value](system::ILogger* logger, system::ILogger::E_LOG_LEVEL level) { using nbl::system::to_string; @@ -1945,7 +2031,7 @@ class CProjectedSphericalRectangleGeometricTester to_string(saValue).c_str()); }; }, - numConfigs, relTol, absTol, true); + numConfigs, relTol, absTol, hardRelTol, hardAbsTol, true); } system::ILogger* m_logger; diff --git a/64_EmulatedFloatTest/main.cpp b/64_EmulatedFloatTest/main.cpp index 7919f68c5..549596bac 100644 --- a/64_EmulatedFloatTest/main.cpp +++ b/64_EmulatedFloatTest/main.cpp @@ -6,6 +6,8 @@ #include "nbl/examples/examples.hpp" #include +#include +#include #include #include #include @@ -17,6 +19,8 @@ #include +#include "nbl/examples/Benchmark/IBenchmark.h" +#include "nbl/examples/Benchmark/GPUBenchmarkHelper.h" using namespace nbl::core; using namespace nbl::hlsl; @@ -26,1195 +30,1031 @@ using namespace nbl::video; using namespace nbl::application_templates; using namespace nbl::examples; -constexpr bool DoTests = true; +constexpr bool DoTests = true; constexpr bool DoBenchmark = true; +// One row per EF64_BENCHMARK_MODE. Each instance owns its own write-sink +// buffer + descriptor set; the framework's GPUBenchmarkHelper handles +// cmdbuf / queryPool / pipeline-stats capture / runTimed timing, IBenchmark +// routes the result through the Aggregator. The shader binds an SSBO at +// set 0 / binding 0, so we pass an explicit dsLayout to createPipeline. 
+class CEF64Benchmark : public GPUBenchmark +{ + public: + static constexpr const char* kSectionLabel = "EF64 Benchmarks"; + + struct SetupData + { + smart_refctd_ptr assetMgr; + core::vector name; // hierarchical row name + EF64_BENCHMARK_MODE mode; // pushed each run() via PC + GPUBenchmarkHelper::ShaderVariant variant; // precompiled "benchmark" SPIRV + uint32_t warmupDispatches; + uint64_t targetBudgetMs; + }; + + // Shape is fixed by the BENCHMARK_WORKGROUP_* macros; expose it so the + // caller uses the same shape both to construct the bench and to build the + // RunContext for its span. + static WorkloadShape shape() + { + const hlsl::uint32_t3 wg = { + BENCHMARK_WORKGROUP_DIMENSION_SIZE_X, + BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y, + BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z}; + const hlsl::uint32_t3 dgc = {BENCHMARK_WORKGROUP_COUNT, 1u, 1u}; + // Shader writes one float64 per thread per dispatch; "sample" == "thread output". + const uint64_t samplesPerDispatch = uint64_t(dgc.x) * dgc.y * dgc.z * wg.x * wg.y * wg.z; + return {.workgroupSize = wg, .dispatchGroupCount = dgc, .samplesPerDispatch = samplesPerDispatch}; + } + + CEF64Benchmark(Aggregator& aggregator, const SetupData& data) + : GPUBenchmark(aggregator, GPUBenchmark::SetupData{ + .name = data.name, + .warmupDispatches = data.warmupDispatches, + .shape = shape(), + .targetBudgetMs = data.targetBudgetMs, + }) + , m_mode(data.mode) + { + // Buffer the shader writes to (descriptor-bound; not BDA). Sized for one + // float64 per thread; the GPU never reads it back to host. + m_buffer = createOutputBuffer(getShape().samplesPerDispatch * sizeof(float64_t)); + + // One SSBO at set 0 / binding 0. createSingleBindingDS wires the + // layout + pool + DS + write descriptor in one call. 
+ auto ds = createSingleBindingDS(m_buffer); + m_dsLayout = std::move(ds.layout); + m_ds = std::move(ds.set); + m_pipelineIdx = createPipeline(data.variant, data.assetMgr, sizeof(BenchmarkPushConstants), joinName(data.name), m_dsLayout); + } + + void doRun() override + { + const PipelineEntry* pe = getPipelineEntry(m_pipelineIdx, joinName(m_name)); + if (!pe) + return; + BenchmarkPushConstants pc = {}; + pc.benchmarkMode = m_mode; + + const TimingResult t = runTimedBudgeted(getWarmupDispatches(), getTargetBudgetMs(), + [&](IGPUCommandBuffer* cb) + { + cb->bindDescriptorSets(EPBP_COMPUTE, pe->layout.get(), 0, 1, &m_ds.get()); + defaultBindAndPush(cb, *pe, pc); + }, + [this](IGPUCommandBuffer* cb) { defaultDispatch(cb); }, + samplesForCurrentRow()); + + record(m_name, t, pe->stats); + } + + private: + EF64_BENCHMARK_MODE m_mode = EF64_BENCHMARK_MODE::NATIVE; + smart_refctd_ptr m_buffer; + smart_refctd_ptr m_dsLayout; + smart_refctd_ptr m_ds; + uint32_t m_pipelineIdx = 0; +}; + class CompatibilityTest final : public MonoDeviceApplication, public BuiltinResourcesApplication { - using device_base_t = MonoDeviceApplication; - using asset_base_t = BuiltinResourcesApplication; -public: - CompatibilityTest(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : - IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} - - virtual SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override - { - auto retval = device_base_t::getPreferredDeviceFeatures(); - retval.pipelineExecutableInfo = true; - return retval; - } - - bool onAppInitialized(smart_refctd_ptr&& system) override - { - // since emulated_float64_t rounds to zero - std::fesetround(FE_TOWARDZERO); - - if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; - if (!asset_base_t::onAppInitialized(std::move(system))) - return false; - - return true; - } - - void onAppTerminated_impl() 
override - { - m_device->waitIdle(); - } - - void workLoopBody() override - { - if constexpr (DoTests) - { - emulated_float64_tests(); - } - if constexpr (DoBenchmark) - { - EF64Benchmark benchmark(*this); - benchmark.run(); - } - - m_keepRunning = false; - } - - bool keepRunning() override - { - return m_keepRunning; - } - - -private: - - bool m_keepRunning = true; - - constexpr static inline uint32_t EmulatedFloat64TestIterations = 1000u; - - enum class EmulatedFloatTestDevice - { - CPU, - GPU - }; - - template - bool compareEmulatedFloat64TestValues(const TestValues& expectedValues, const TestValues& testValues) - { - bool success = true; - - auto printOnFailure = [this](EmulatedFloatTestDevice device) - { - std::string errorMsgPrefix = ""; - if (device == EmulatedFloatTestDevice::CPU) - errorMsgPrefix = "CPU test fail:"; - else - errorMsgPrefix = "GPU test fail:"; - - m_logger->log("%s", ILogger::ELL_ERROR, errorMsgPrefix.c_str()); - m_logFile << errorMsgPrefix << '\n'; - }; - - auto printOnArithmeticFailure = [this](const char* valName, uint64_t expectedValue, uint64_t testValue, uint64_t a, uint64_t b) - { - double expectedAsDouble = reinterpret_cast(expectedValue); - double testAsDouble = reinterpret_cast(testValue); - double error = std::abs(expectedAsDouble - testAsDouble); - - std::stringstream ss; - ss << "for input values: A = " << reinterpret_cast(a) << " B = " << reinterpret_cast(b) << '\n'; - ss << valName << " not equal!"; - ss << "\nexpected value: " << std::fixed << std::setprecision(20) << expectedAsDouble; - ss << "\ntest value: " << std::fixed << std::setprecision(20) << testAsDouble; - ss << "\nerror = " << error << '\n'; - ss << "bit representations: \n"; - ss << "seeeeeeeeeeemmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm\n"; - ss << std::bitset<64>(expectedValue) << " - expectedValue bit pattern\n"; - ss << std::bitset<64>(testValue) << " - testValue bit pattern \n"; - - m_logger->log("%s", ILogger::ELL_ERROR, ss.str().c_str()); - 
m_logFile << ss.str() << '\n'; - - //std::cout << "ULP error: " << std::max(expectedValue, testValue) - std::min(expectedValue, testValue) << "\n\n"; - - }; - - auto calcULPError = [](emulated_float64_t::storage_t expectedValue, emulated_float64_t::storage_t testValue) - { - return std::max(expectedValue, testValue) - std::min(expectedValue, testValue); - }; - - auto printOnComparisonFailure = [this](const char* valName, int expectedValue, int testValue, double a, double b) - { - std::string inputValuesStr = std::string("for input values: A = ") + std::to_string(a) + std::string(" B = ") + std::to_string(b); - - m_logger->log("%s", ILogger::ELL_ERROR, inputValuesStr.c_str()); - m_logFile << inputValuesStr << '\n'; - - std::stringstream ss; - ss << valName << " not equal!"; - ss << "\nexpected value: " << std::boolalpha << bool(expectedValue); - ss << "\ntest value: " << std::boolalpha << bool(testValue); - - m_logger->log("%s", ILogger::ELL_ERROR, ss.str().c_str()); - m_logFile << ss.str() << '\n'; - }; - - if (calcULPError(expectedValues.int32CreateVal, testValues.int32CreateVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("int32CreateVal", expectedValues.int32CreateVal, testValues.int32CreateVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.int64CreateVal, testValues.int64CreateVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("int64CreateVal", expectedValues.int64CreateVal, testValues.int64CreateVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.uint32CreateVal, testValues.uint32CreateVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("uint32CreateVal", expectedValues.uint32CreateVal, testValues.uint32CreateVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.uint64CreateVal, testValues.uint64CreateVal) > 1u) - { - printOnFailure(Device); - 
printOnArithmeticFailure("uint64CreateVal", expectedValues.uint64CreateVal, testValues.uint64CreateVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.float32CreateVal, testValues.float32CreateVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("float32CreateVal", expectedValues.float32CreateVal, testValues.float32CreateVal, expectedValues.a, expectedValues.b); - success = false; - } - if (expectedValues.float64CreateVal != testValues.float64CreateVal) - { - printOnFailure(Device); - printOnArithmeticFailure("float64CreateVal", expectedValues.float64CreateVal, testValues.float64CreateVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.additionVal, testValues.additionVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("additionVal", expectedValues.additionVal, testValues.additionVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.substractionVal, testValues.substractionVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("substractionVal", expectedValues.substractionVal, testValues.substractionVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.multiplicationVal, testValues.multiplicationVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("multiplicationVal", expectedValues.multiplicationVal, testValues.multiplicationVal, expectedValues.a, expectedValues.b); - success = false; - } - if (calcULPError(expectedValues.divisionVal, testValues.divisionVal) > 1u) - { - printOnFailure(Device); - printOnArithmeticFailure("divisionVal", expectedValues.divisionVal, testValues.divisionVal, expectedValues.a, expectedValues.b); - success = false; - } - if (expectedValues.lessOrEqualVal != testValues.lessOrEqualVal) - { - printOnFailure(Device); - printOnComparisonFailure("lessOrEqualVal", expectedValues.lessOrEqualVal, 
testValues.lessOrEqualVal, expectedValues.a, expectedValues.b); - success = false; - } - if (expectedValues.greaterOrEqualVal != testValues.greaterOrEqualVal) - { - printOnFailure(Device); - printOnComparisonFailure("greaterOrEqualVal", expectedValues.greaterOrEqualVal, testValues.greaterOrEqualVal, expectedValues.a, expectedValues.b); - success = false; - } - if (expectedValues.equalVal != testValues.equalVal) - { - printOnFailure(Device); - printOnComparisonFailure("equalVal", expectedValues.equalVal, testValues.equalVal, expectedValues.a, expectedValues.b); - success = false; - } - if (expectedValues.notEqualVal != testValues.notEqualVal) - { - printOnFailure(Device); - printOnComparisonFailure("notEqualVal", expectedValues.notEqualVal, testValues.notEqualVal, expectedValues.a, expectedValues.b); - success = false; - } - if (expectedValues.lessVal != testValues.lessVal) - { - printOnFailure(Device); - printOnComparisonFailure("lessVal", expectedValues.lessVal, testValues.lessVal, expectedValues.a, expectedValues.b); - success = false; - } - if (expectedValues.greaterVal != testValues.greaterVal) - { - printOnFailure(Device); - printOnComparisonFailure("greaterVal", expectedValues.greaterVal, testValues.greaterVal, expectedValues.a, expectedValues.b); - success = false; - } - - return success; - }; - - class EF64Submitter - { - public: - EF64Submitter(CompatibilityTest& base) - :m_base(base), m_pushConstants({}), m_semaphoreCounter(0) - { - // setting up pipeline in the constructor - m_queueFamily = base.getComputeQueue()->getFamilyIndex(); - m_semaphore = base.m_device->createSemaphore(0); - m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) - base.logFail("Failed to create Command Buffers!\n"); - - // Load shaders, set up pipeline + using device_base_t = MonoDeviceApplication; + using asset_base_t = 
BuiltinResourcesApplication; + + public: + CompatibilityTest(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + virtual SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override + { + auto retval = device_base_t::getPreferredDeviceFeatures(); + retval.pipelineExecutableInfo = true; + return retval; + } + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + // since emulated_float64_t rounds to zero + std::fesetround(FE_TOWARDZERO); + + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + return true; + } + + void onAppTerminated_impl() override + { + m_device->waitIdle(); + } + + void workLoopBody() override + { + if constexpr (DoTests) + { + emulated_float64_tests(); + } + if constexpr (DoBenchmark) + { + runEF64Benchmarks(); + } + + m_keepRunning = false; + } + + bool keepRunning() override + { + return m_keepRunning; + } + + + private: + bool m_keepRunning = true; + + constexpr static inline uint32_t EmulatedFloat64TestIterations = 1000u; + + enum class EmulatedFloatTestDevice + { + CPU, + GPU + }; + + template + bool compareEmulatedFloat64TestValues(const TestValues& expectedValues, const TestValues& testValues) + { + bool success = true; + + auto printOnFailure = [this](EmulatedFloatTestDevice device) + { + std::string errorMsgPrefix = ""; + if (device == EmulatedFloatTestDevice::CPU) + errorMsgPrefix = "CPU test fail:"; + else + errorMsgPrefix = "GPU test fail:"; + + m_logger->log("%s", ILogger::ELL_ERROR, errorMsgPrefix.c_str()); + m_logFile << errorMsgPrefix << '\n'; + }; + + auto printOnArithmeticFailure = [this](const char* valName, uint64_t expectedValue, uint64_t testValue, uint64_t a, uint64_t b) + { + double expectedAsDouble = reinterpret_cast(expectedValue); + 
double testAsDouble = reinterpret_cast(testValue); + double error = std::abs(expectedAsDouble - testAsDouble); + + std::stringstream ss; + ss << "for input values: A = " << reinterpret_cast(a) << " B = " << reinterpret_cast(b) << '\n'; + ss << valName << " not equal!"; + ss << "\nexpected value: " << std::fixed << std::setprecision(20) << expectedAsDouble; + ss << "\ntest value: " << std::fixed << std::setprecision(20) << testAsDouble; + ss << "\nerror = " << error << '\n'; + ss << "bit representations: \n"; + ss << "seeeeeeeeeeemmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm\n"; + ss << std::bitset<64>(expectedValue) << " - expectedValue bit pattern\n"; + ss << std::bitset<64>(testValue) << " - testValue bit pattern \n"; + + m_logger->log("%s", ILogger::ELL_ERROR, ss.str().c_str()); + m_logFile << ss.str() << '\n'; + + //std::cout << "ULP error: " << std::max(expectedValue, testValue) - std::min(expectedValue, testValue) << "\n\n"; + }; + + auto calcULPError = [](emulated_float64_t::storage_t expectedValue, emulated_float64_t::storage_t testValue) + { + return std::max(expectedValue, testValue) - std::min(expectedValue, testValue); + }; + + auto printOnComparisonFailure = [this](const char* valName, int expectedValue, int testValue, double a, double b) + { + std::string inputValuesStr = std::string("for input values: A = ") + std::to_string(a) + std::string(" B = ") + std::to_string(b); + + m_logger->log("%s", ILogger::ELL_ERROR, inputValuesStr.c_str()); + m_logFile << inputValuesStr << '\n'; + + std::stringstream ss; + ss << valName << " not equal!"; + ss << "\nexpected value: " << std::boolalpha << bool(expectedValue); + ss << "\ntest value: " << std::boolalpha << bool(testValue); + + m_logger->log("%s", ILogger::ELL_ERROR, ss.str().c_str()); + m_logFile << ss.str() << '\n'; + }; + + if (calcULPError(expectedValues.int32CreateVal, testValues.int32CreateVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("int32CreateVal", 
expectedValues.int32CreateVal, testValues.int32CreateVal, expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.int64CreateVal, testValues.int64CreateVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("int64CreateVal", expectedValues.int64CreateVal, testValues.int64CreateVal, expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.uint32CreateVal, testValues.uint32CreateVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("uint32CreateVal", expectedValues.uint32CreateVal, testValues.uint32CreateVal, expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.uint64CreateVal, testValues.uint64CreateVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("uint64CreateVal", expectedValues.uint64CreateVal, testValues.uint64CreateVal, expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.float32CreateVal, testValues.float32CreateVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("float32CreateVal", expectedValues.float32CreateVal, testValues.float32CreateVal, expectedValues.a, expectedValues.b); + success = false; + } + if (expectedValues.float64CreateVal != testValues.float64CreateVal) + { + printOnFailure(Device); + printOnArithmeticFailure("float64CreateVal", expectedValues.float64CreateVal, testValues.float64CreateVal, expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.additionVal, testValues.additionVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("additionVal", expectedValues.additionVal, testValues.additionVal, expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.substractionVal, testValues.substractionVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("substractionVal", expectedValues.substractionVal, testValues.substractionVal, 
expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.multiplicationVal, testValues.multiplicationVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("multiplicationVal", expectedValues.multiplicationVal, testValues.multiplicationVal, expectedValues.a, expectedValues.b); + success = false; + } + if (calcULPError(expectedValues.divisionVal, testValues.divisionVal) > 1u) + { + printOnFailure(Device); + printOnArithmeticFailure("divisionVal", expectedValues.divisionVal, testValues.divisionVal, expectedValues.a, expectedValues.b); + success = false; + } + if (expectedValues.lessOrEqualVal != testValues.lessOrEqualVal) + { + printOnFailure(Device); + printOnComparisonFailure("lessOrEqualVal", expectedValues.lessOrEqualVal, testValues.lessOrEqualVal, expectedValues.a, expectedValues.b); + success = false; + } + if (expectedValues.greaterOrEqualVal != testValues.greaterOrEqualVal) + { + printOnFailure(Device); + printOnComparisonFailure("greaterOrEqualVal", expectedValues.greaterOrEqualVal, testValues.greaterOrEqualVal, expectedValues.a, expectedValues.b); + success = false; + } + if (expectedValues.equalVal != testValues.equalVal) + { + printOnFailure(Device); + printOnComparisonFailure("equalVal", expectedValues.equalVal, testValues.equalVal, expectedValues.a, expectedValues.b); + success = false; + } + if (expectedValues.notEqualVal != testValues.notEqualVal) + { + printOnFailure(Device); + printOnComparisonFailure("notEqualVal", expectedValues.notEqualVal, testValues.notEqualVal, expectedValues.a, expectedValues.b); + success = false; + } + if (expectedValues.lessVal != testValues.lessVal) + { + printOnFailure(Device); + printOnComparisonFailure("lessVal", expectedValues.lessVal, testValues.lessVal, expectedValues.a, expectedValues.b); + success = false; + } + if (expectedValues.greaterVal != testValues.greaterVal) + { + printOnFailure(Device); + printOnComparisonFailure("greaterVal", expectedValues.greaterVal, 
testValues.greaterVal, expectedValues.a, expectedValues.b); + success = false; + } + + return success; + }; + + class EF64Submitter + { + public: + EF64Submitter(CompatibilityTest& base) + : m_base(base), m_pushConstants({}), m_semaphoreCounter(0) + { + // setting up pipeline in the constructor + m_queueFamily = base.getComputeQueue()->getFamilyIndex(); + m_semaphore = base.m_device->createSemaphore(0); + m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) + base.logFail("Failed to create Command Buffers!\n"); + + // Load shaders, set up pipeline + { + smart_refctd_ptr shader; { - smart_refctd_ptr shader; - { - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = base.m_logger.get(); - lp.workingDirectory = "app_resources"; // virtual root - - auto key = nbl::this_example::builtin::build::get_spirv_key<"test">(base.m_device.get()); - auto assetBundle = base.m_assetMgr->getAsset(key.data(), lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - { - base.logFail("Could not load shader!"); - assert(0); - } - - // It would be super weird if loading a shader from a file produced more than 1 asset - assert(assets.size() == 1); - shader = IAsset::castDown(assets[0]); - } - - if (!shader) - base.logFail("Failed to load precompiled \"test\" shader!\n"); - - nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { - { - .binding = 0, - .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ShaderStage::ESS_COMPUTE, - .count = 1 - } - }; - smart_refctd_ptr dsLayout = base.m_device->createDescriptorSetLayout(bindings); - if (!dsLayout) - base.logFail("Failed to create a Descriptor Layout!\n"); - - SPushConstantRange pushConstantRanges[] = { - { - .stageFlags = ShaderStage::ESS_COMPUTE, - 
.offset = 0, - .size = sizeof(PushConstants) - } - }; - m_pplnLayout = base.m_device->createPipelineLayout(pushConstantRanges, smart_refctd_ptr(dsLayout)); - if (!m_pplnLayout) - base.logFail("Failed to create a Pipeline Layout!\n"); - - { - IGPUComputePipeline::SCreationParams params = {}; - params.layout = m_pplnLayout.get(); - params.shader.entryPoint = "main"; - params.shader.shader = shader.get(); - if (base.m_device->getEnabledFeatures().pipelineExecutableInfo) - { - params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS; - params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; - } - if (!base.m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_pipeline)) - base.logFail("Failed to create pipelines (compile & link shaders)!\n"); - - if (base.m_device->getEnabledFeatures().pipelineExecutableInfo) - { - auto report = system::to_string(m_pipeline->getExecutableInfo()); - base.m_logger->log("EF64Submitter Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, report.c_str()); - } - } - - // Allocate the memory - { - constexpr size_t BufferSize = sizeof(TestValues); - - nbl::video::IGPUBuffer::SCreationParams params = {}; - params.size = BufferSize; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - smart_refctd_ptr outputBuff = base.m_device->createBuffer(std::move(params)); - if (!outputBuff) - base.logFail("Failed to create a GPU Buffer of size %d!\n", params.size); - - outputBuff->setObjectDebugName("emulated_float64_t output buffer"); - - nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs(); - reqs.memoryTypeBits &= base.m_physicalDevice->getHostVisibleMemoryTypeBits(); - - m_allocation = base.m_device->allocate(reqs, outputBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); - if (!m_allocation.isValid()) - base.logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); - - 
assert(outputBuff->getBoundMemory().memory == m_allocation.memory.get()); - smart_refctd_ptr pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 }); - - m_ds = pool->createDescriptorSet(std::move(dsLayout)); - { - IGPUDescriptorSet::SDescriptorInfo info[1]; - info[0].desc = smart_refctd_ptr(outputBuff); - info[0].info.buffer = { .offset = 0,.size = BufferSize }; - IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { - {.dstSet = m_ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info} - }; - base.m_device->updateDescriptorSets(writes, {}); - } - } - - if (!m_allocation.memory->map({ 0ull,m_allocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_READ)) - base.logFail("Failed to map the Device Memory!\n"); + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = base.m_logger.get(); + lp.workingDirectory = "app_resources"; // virtual root + + auto key = nbl::this_example::builtin::build::get_spirv_key<"test">(base.m_device.get()); + auto assetBundle = base.m_assetMgr->getAsset(key.data(), lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + base.logFail("Could not load shader!"); + assert(0); + } + + // It would be super weird if loading a shader from a file produced more than 1 asset + assert(assets.size() == 1); + shader = IAsset::castDown(assets[0]); } - // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches - const ILogicalDevice::MappedMemoryRange memoryRange(m_allocation.memory.get(), 0ull, m_allocation.memory->getAllocationSize()); - if (!m_allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - base.m_device->invalidateMappedMemoryRanges(1, &memoryRange); - - assert(memoryRange.valid() && memoryRange.length >= sizeof(TestValues)); - - m_queue = m_base.m_device->getQueue(m_queueFamily, 0); - } - - ~EF64Submitter() - { - m_allocation.memory->unmap(); - } - 
- void setPushConstants(PushConstants& pc) - { - m_pushConstants = pc; - } - - TestValues submitGetGPUTestValues() - { - // record command buffer - m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); - m_cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); - m_cmdbuf->beginDebugMarker("emulated_float64_t compute dispatch", vectorSIMDf(0, 1, 0, 1)); - m_cmdbuf->bindComputePipeline(m_pipeline.get()); - m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); - m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstants), &m_pushConstants); - m_cmdbuf->dispatch(WORKGROUP_SIZE, 1, 1); - m_cmdbuf->endDebugMarker(); - m_cmdbuf->end(); - - IQueue::SSubmitInfo submitInfos[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()}}; - submitInfos[0].commandBuffers = cmdbufs; - const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = m_semaphore.get(), .value = ++m_semaphoreCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - submitInfos[0].signalSemaphores = signals; - - m_base.m_api->startCapture(); - m_queue->submit(submitInfos); - m_base.m_api->endCapture(); - - m_base.m_device->waitIdle(); - TestValues output; - std::memcpy(&output, static_cast*>(m_allocation.memory->getMappedPointer()), sizeof(TestValues)); - m_base.m_device->waitIdle(); - - return output; - } - - private: - uint32_t m_queueFamily; - nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; - smart_refctd_ptr m_cmdbuf = nullptr; - smart_refctd_ptr m_cmdpool = nullptr; - smart_refctd_ptr m_ds = nullptr; - smart_refctd_ptr m_pplnLayout = nullptr; - PushConstants m_pushConstants; - CompatibilityTest& m_base; - smart_refctd_ptr m_pipeline; - smart_refctd_ptr m_semaphore; - IQueue* m_queue; - uint64_t m_semaphoreCounter; - }; - - void emulated_float64_tests() - { - EF64Submitter submitter(*this); - - auto printTestOutput = [this](const 
std::string& functionName, const EmulatedFloat64TestOutput& testResult) - { - std::cout << functionName << ": " << std::endl; - - if (!testResult.cpuTestsSucceed) - logFail("Incorrect CPU determinated values!"); - else - m_logger->log("Correct CPU determinated values!", ILogger::ELL_PERFORMANCE); - - if (!testResult.gpuTestsSucceed) - logFail("Incorrect GPU determinated values!"); - else - m_logger->log("Correct GPU determinated values!", ILogger::ELL_PERFORMANCE); - }; - - m_logFile.open("EmulatedFloatTestLog.txt", std::ios::out | std::ios::trunc); - if (!m_logFile.is_open()) - m_logger->log("Failed to open log file!", system::ILogger::ELL_ERROR); - - printTestOutput("emulatedFloat64RandomValuesTest", emulatedFloat64RandomValuesTest(submitter)); - printTestOutput("emulatedFloat64RandomValuesTestContrastingExponents", emulatedFloat64RandomValuesTestContrastingExponents(submitter)); - printTestOutput("emulatedFloat64NegAndPosZeroTest", emulatedFloat64NegAndPosZeroTest(submitter)); - printTestOutput("emulatedFloat64BothValuesInfTest", emulatedFloat64BothValuesInfTest(submitter)); - printTestOutput("emulatedFloat64BothValuesNegInfTest", emulatedFloat64BothValuesNegInfTest(submitter)); - printTestOutput("emulatedFloat64OneValIsInfOtherIsNegInfTest", emulatedFloat64OneValIsInfOtherIsNegInfTest(submitter)); - printTestOutput("emulatedFloat64OneValIsInfTest", emulatedFloat64OneValIsInfTest(submitter)); - printTestOutput("emulatedFloat64OneValIsNegInfTest", emulatedFloat64OneValIsNegInfTest(submitter)); - if(false) // doesn't work for some reason + fast math is enabled by default - printTestOutput("emulatedFloat64BNaNTest", emulatedFloat64BNaNTest(submitter)); - printTestOutput("emulatedFloat64BInfTest", emulatedFloat64OneValIsZeroTest(submitter)); - printTestOutput("emulatedFloat64BNegInfTest", emulatedFloat64OneValIsNegZeroTest(submitter)); - - m_logFile.close(); - } - - template - struct EmulatedFloat64TestValuesInfo - { - emulated_float64_t a; - emulated_float64_t b; - 
ConstructorTestValues constrTestValues; - TestValues expectedTestValues; - - void fillExpectedTestValues() - { - double aAsDouble = reinterpret_cast(a); - double bAsDouble = reinterpret_cast(b); - - expectedTestValues.a = a.data; - expectedTestValues.b = b.data; - - expectedTestValues.int32CreateVal = bit_cast(double(constrTestValues.int32)); - expectedTestValues.int64CreateVal = bit_cast(double(constrTestValues.int64)); - expectedTestValues.uint32CreateVal = bit_cast(double(constrTestValues.uint32)); - expectedTestValues.uint64CreateVal = bit_cast(double(constrTestValues.uint64)); - expectedTestValues.float32CreateVal = bit_cast(double(constrTestValues.float32)); - expectedTestValues.float64CreateVal = bit_cast(constrTestValues.float64); - expectedTestValues.additionVal = emulated_float64_t::create(aAsDouble + bAsDouble).data; - expectedTestValues.substractionVal = emulated_float64_t::create(aAsDouble - bAsDouble).data; - expectedTestValues.multiplicationVal = emulated_float64_t::create(aAsDouble * bAsDouble).data; - expectedTestValues.divisionVal = emulated_float64_t::create(aAsDouble / bAsDouble).data; - expectedTestValues.lessOrEqualVal = aAsDouble <= bAsDouble; - expectedTestValues.greaterOrEqualVal = aAsDouble >= bAsDouble; - expectedTestValues.equalVal = aAsDouble == bAsDouble; - expectedTestValues.notEqualVal = aAsDouble != bAsDouble; - expectedTestValues.lessVal = aAsDouble < bAsDouble; - expectedTestValues.greaterVal = aAsDouble > bAsDouble; - } - }; - - struct EmulatedFloat64TestOutput - { - bool cpuTestsSucceed; - bool gpuTestsSucceed; - }; - - EmulatedFloat64TestOutput emulatedFloat64LoopedTests_impl(EF64Submitter& submitter, - const uint32_t iterations, - const std::function& determineValueA, - const std::function& determineValueB) - { - EmulatedFloat64TestOutput output = { true, true }; - - std::uniform_int_distribution i32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_int_distribution 
i64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_int_distribution u32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_int_distribution u64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_real_distribution fDistribution(-100000.0, 100000.0); - - std::random_device rd; - std::mt19937 mt(rd()); - - for (uint32_t i = 0u; i < iterations; ++i) - { - // generate random test values - EmulatedFloat64TestValuesInfo testValInfo; - double aTmp = determineValueA(); - double bTmp = determineValueB(); - testValInfo.a.data = reinterpret_cast::storage_t&>(aTmp); - testValInfo.b.data = reinterpret_cast::storage_t&>(bTmp); - testValInfo.constrTestValues.int32 = i32Distribution(mt); - testValInfo.constrTestValues.int64 = i64Distribution(mt); - testValInfo.constrTestValues.uint32 = u32Distribution(mt); - testValInfo.constrTestValues.uint64 = u64Distribution(mt); - testValInfo.constrTestValues.float32 = fDistribution(mt); - testValInfo.constrTestValues.float64 = fDistribution(mt); - - testValInfo.fillExpectedTestValues(); - auto singleTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); - - if (!singleTestOutput.cpuTestsSucceed) - output.cpuTestsSucceed = false; - if (!singleTestOutput.gpuTestsSucceed) - output.gpuTestsSucceed = false; - } - - return output; - } - - EmulatedFloat64TestOutput emulatedFloat64RandomValuesTest(EF64Submitter& submitter) - { - auto getRandomFloat64 = []() - { - static std::random_device rd; - static std::mt19937 mt(rd()); - static std::uniform_real_distribution distribution(-100000.0, 100000.0); - - - return distribution(mt); - }; - - return emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations, getRandomFloat64, getRandomFloat64); - } - - EmulatedFloat64TestOutput emulatedFloat64RandomValuesTestContrastingExponents(EF64Submitter& submitter) - { - auto getRandomSmallFloat64 = []() - { - static std::random_device rd; 
- static std::mt19937 mt(rd()); - static std::uniform_real_distribution distribution(-0.01, 0.01); - - return distribution(mt); - }; - - auto getRandomLargeFloat64 = []() - { - static std::random_device rd; - static std::mt19937 mt(rd()); - static std::uniform_real_distribution distribution(1000000000.0, 2000000000.0); - static std::uniform_int_distribution coinFlipDistribution(0, 1); - - double output = distribution(mt); - if (coinFlipDistribution(mt)) - output = -output; - - return output; - }; - - EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomSmallFloat64, getRandomLargeFloat64); - EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomLargeFloat64, getRandomSmallFloat64); - - EmulatedFloat64TestOutput output; - output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; - output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; - return output; - } - - EmulatedFloat64TestOutput emulatedFloat64BothValuesNaNTest(EF64Submitter& submitter) - { - smart_refctd_ptr semaphore = m_device->createSemaphore(0); - - EmulatedFloat64TestValuesInfo testValInfo; - const float32_t nan32 = std::numeric_limits::quiet_NaN(); - const float64_t nan64 = std::numeric_limits::quiet_NaN(); - testValInfo.a = emulated_float64_t::create(nan64); - testValInfo.b = emulated_float64_t::create(nan64); - testValInfo.constrTestValues = { - .int32 = std::bit_cast(nan32), - .int64 = std::bit_cast(nan64), - .uint32 = std::bit_cast(nan32), - .uint64 = std::bit_cast(nan64), - .float32 = nan32 - //.float64 = nan64 - }; - - testValInfo.fillExpectedTestValues(); - return performEmulatedFloat64Tests(testValInfo, submitter); - } - - EmulatedFloat64TestOutput emulatedFloat64NegAndPosZeroTest(EF64Submitter& submitter) - { - smart_refctd_ptr semaphore = 
m_device->createSemaphore(0); - - EmulatedFloat64TestValuesInfo testValInfo; - testValInfo.a = emulated_float64_t::create(ieee754::traits::signMask); - testValInfo.b = emulated_float64_t::create(std::bit_cast(0.0)); - testValInfo.constrTestValues = { - .int32 = 0, - .int64 = 0, - .uint32 = 0, - .uint64 = 0, - .float32 = 0 - }; - - testValInfo.fillExpectedTestValues(); - auto firstTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); - std::swap(testValInfo.a, testValInfo.b); - testValInfo.fillExpectedTestValues(); - auto secondTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); - - return { firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed, firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed }; - } - - EmulatedFloat64TestOutput emulatedFloat64BothValuesInfTest(EF64Submitter& submitter) - { - smart_refctd_ptr semaphore = m_device->createSemaphore(0); - - EmulatedFloat64TestValuesInfo testValInfo; - const float32_t inf32 = std::numeric_limits::infinity(); - const float64_t inf64 = std::numeric_limits::infinity(); - testValInfo.a = emulated_float64_t::create(inf64); - testValInfo.b = emulated_float64_t::create(inf64); - testValInfo.constrTestValues = { - .int32 = 0, - .int64 = 0, - .uint32 = 0, - .uint64 = 0, - .float32 = inf32 - //.float64 = inf64 - }; - - testValInfo.fillExpectedTestValues(); - return performEmulatedFloat64Tests(testValInfo, submitter); - } - - EmulatedFloat64TestOutput emulatedFloat64BothValuesNegInfTest(EF64Submitter& submitter) - { - smart_refctd_ptr semaphore = m_device->createSemaphore(0); - - EmulatedFloat64TestValuesInfo testValInfo; - const float32_t inf32 = -std::numeric_limits::infinity(); - const float64_t inf64 = -std::numeric_limits::infinity(); - testValInfo.a = emulated_float64_t::create(inf64); - testValInfo.b = emulated_float64_t::create(inf64); - testValInfo.constrTestValues = { - .int32 = 0, - .int64 = 0, - .uint32 = 0, - .uint64 = 0, - .float32 = inf32 - //.float64 = 
inf64 - }; - - testValInfo.fillExpectedTestValues(); - return performEmulatedFloat64Tests(testValInfo, submitter); - } - - EmulatedFloat64TestOutput emulatedFloat64OneValIsInfOtherIsNegInfTest(EF64Submitter& submitter) - { - smart_refctd_ptr semaphore = m_device->createSemaphore(0); - - EmulatedFloat64TestValuesInfo testValInfo; - const float64_t inf64 = -std::numeric_limits::infinity(); - testValInfo.a = emulated_float64_t::create(inf64); - testValInfo.b = emulated_float64_t::create(inf64); - testValInfo.constrTestValues = { - .int32 = 0, - .int64 = 0, - .uint32 = 0, - .uint64 = 0, - .float32 = 0 - //.float64 = inf64 - }; - - testValInfo.fillExpectedTestValues(); - auto firstTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); - std::swap(testValInfo.a, testValInfo.b); - testValInfo.fillExpectedTestValues(); - auto secondTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); - - return { firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed, firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed }; - } - - // TODO: fix - EmulatedFloat64TestOutput emulatedFloat64BNaNTest(EF64Submitter& submitter) - { - EmulatedFloat64TestOutput output = { true, true }; - smart_refctd_ptr semaphore = m_device->createSemaphore(0); - - for (uint32_t i = 0u; i < EmulatedFloat64TestIterations; ++i) - { - std::random_device rd; - std::mt19937 mt(rd()); - - std::uniform_int_distribution i32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_int_distribution i64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_int_distribution u32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_int_distribution u64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); - std::uniform_real_distribution f32Distribution(-100000.0f, 100000.0f); - std::uniform_real_distribution f64Distribution(-100000.0, 100000.0); - - 
EmulatedFloat64TestValuesInfo testValInfo; - double aTmp = f64Distribution(mt); - double bTmp = std::numeric_limits::quiet_NaN(); - testValInfo.a.data = reinterpret_cast::storage_t&>(aTmp); - testValInfo.b.data = reinterpret_cast::storage_t&>(bTmp); - testValInfo.constrTestValues.int32 = i32Distribution(mt); - testValInfo.constrTestValues.int64 = i64Distribution(mt); - testValInfo.constrTestValues.uint32 = u32Distribution(mt); - testValInfo.constrTestValues.uint64 = u64Distribution(mt); - testValInfo.constrTestValues.float32 = f32Distribution(mt); - //testValInfo.constrTestValues.float64 = f64Distribution(mt); - - testValInfo.fillExpectedTestValues(); - auto singleTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); - - if (!singleTestOutput.cpuTestsSucceed) - output.cpuTestsSucceed = false; - if (!singleTestOutput.gpuTestsSucceed) - output.gpuTestsSucceed = false; - } - - return output; - } - - EmulatedFloat64TestOutput emulatedFloat64OneValIsInfTest(EF64Submitter& submitter) - { - auto getRandomFloat64 = []() - { - static std::random_device rd; - static std::mt19937 mt(rd()); - static std::uniform_real_distribution distribution(-100000.0, 100000.0); - - return distribution(mt); - }; - - auto getInfinity = []() - { - return std::numeric_limits::infinity(); - }; - - EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getInfinity); - EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getInfinity, getRandomFloat64); - - EmulatedFloat64TestOutput output; - output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; - output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; - return output; - } - - EmulatedFloat64TestOutput emulatedFloat64OneValIsNegInfTest(EF64Submitter& submitter) - { - auto getRandomFloat64 = []() - { - static 
std::random_device rd; - static std::mt19937 mt(rd()); - static std::uniform_real_distribution distribution(-100000.0, 100000.0); - + if (!shader) + base.logFail("Failed to load precompiled \"test\" shader!\n"); + + nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { + {.binding = 0, + .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = ShaderStage::ESS_COMPUTE, + .count = 1}}; + smart_refctd_ptr dsLayout = base.m_device->createDescriptorSetLayout(bindings); + if (!dsLayout) + base.logFail("Failed to create a Descriptor Layout!\n"); + + SPushConstantRange pushConstantRanges[] = { + {.stageFlags = ShaderStage::ESS_COMPUTE, + .offset = 0, + .size = sizeof(PushConstants)}}; + m_pplnLayout = base.m_device->createPipelineLayout(pushConstantRanges, smart_refctd_ptr(dsLayout)); + if (!m_pplnLayout) + base.logFail("Failed to create a Pipeline Layout!\n"); - return distribution(mt); - }; - - auto getNegInfinity = []() - { - return -std::numeric_limits::infinity(); - }; - - EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getNegInfinity); - EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getNegInfinity, getRandomFloat64); - - EmulatedFloat64TestOutput output; - output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; - output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; - return output; - } - - EmulatedFloat64TestOutput emulatedFloat64OneValIsZeroTest(EF64Submitter& submitter) - { - auto getRandomFloat64 = []() - { - static std::random_device rd; - static std::mt19937 mt(rd()); - static std::uniform_real_distribution distribution(-100000.0, 100000.0); - - return distribution(mt); - }; - - auto getZero = []() - { - return 
0.0; - }; - - EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getZero); - EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getZero, getRandomFloat64); - - EmulatedFloat64TestOutput output; - output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; - output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; - return output; - } - - EmulatedFloat64TestOutput emulatedFloat64OneValIsNegZeroTest(EF64Submitter& submitter) - { - auto getRandomFloat64 = []() - { - static std::random_device rd; - static std::mt19937 mt(rd()); - static std::uniform_real_distribution distribution(-100000.0, 100000.0); - - return distribution(mt); - }; - - auto getNegZero = []() - { - return -0.0; - }; - - EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getNegZero); - EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getNegZero, getRandomFloat64); - - EmulatedFloat64TestOutput output; - output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; - output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; - return output; - } - - template - EmulatedFloat64TestOutput performEmulatedFloat64Tests(EmulatedFloat64TestValuesInfo& testValInfo, EF64Submitter& submitter) - { - emulated_float64_t a = testValInfo.a; - emulated_float64_t b = testValInfo.b; - - const TestValues cpuTestValues = { - .int32CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.int32).data, - .int64CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.int64).data, - .uint32CreateVal = 
emulated_float64_t::create(testValInfo.constrTestValues.uint32).data, - .uint64CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.uint64).data, - .float32CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.float32).data, - .float64CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.float64).data, - .additionVal = (a + b).data, - .substractionVal = (a - b).data, - .multiplicationVal = (a * b).data, - .divisionVal = (a / b).data, - .lessOrEqualVal = a <= b, - .greaterOrEqualVal = a >= b, - .equalVal = a == b, - .notEqualVal = a != b, - .lessVal = a < b, - .greaterVal = a > b - }; - - EmulatedFloat64TestOutput output; - - // cpu validation - output.cpuTestsSucceed = compareEmulatedFloat64TestValues(testValInfo.expectedTestValues, cpuTestValues); - - // gpu validation - PushConstants pc; - pc.a = reinterpret_cast(a); - pc.b = reinterpret_cast(b); - pc.constrTestVals = testValInfo.constrTestValues; - - submitter.setPushConstants(pc); - auto gpuTestValues = submitter.submitGetGPUTestValues(); - - output.gpuTestsSucceed = compareEmulatedFloat64TestValues(testValInfo.expectedTestValues, gpuTestValues); - - return output; - } - - class EF64Benchmark final - { - public: - EF64Benchmark(CompatibilityTest& base) - { - m_device = base.m_device; - m_logger = base.m_logger; - m_api = base.m_api; - - // setting up pipeline in the constructor - m_queueFamily = base.getComputeQueue()->getFamilyIndex(); - m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - //core::smart_refctd_ptr* cmdBuffs[] = { &m_cmdbuf, &m_timestampBeforeCmdBuff, &m_timestampAfterCmdBuff }; - if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) - base.logFail("Failed to create Command Buffers!\n"); - if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampBeforeCmdBuff)) - base.logFail("Failed to create Command 
Buffers!\n"); - if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampAfterCmdBuff)) - base.logFail("Failed to create Command Buffers!\n"); - - // Load shaders, set up pipeline { - smart_refctd_ptr shader; - { - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = base.m_logger.get(); - lp.workingDirectory = "app_resources"; // virtual root - // this time we load a shader directly from a file - auto key = nbl::this_example::builtin::build::get_spirv_key<"benchmark">(m_device.get()); - auto assetBundle = base.m_assetMgr->getAsset(key.data(), lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - { - base.logFail("Could not load shader!"); - assert(0); - } - - // It would be super weird if loading a shader from a file produced more than 1 asset - assert(assets.size() == 1); - shader = IAsset::castDown(assets[0]); - } - - if (!shader) - base.logFail("Failed to load precompiled \"benchmark\" shader!\n"); - - nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { - { - .binding = 0, - .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ShaderStage::ESS_COMPUTE, - .count = 1 - } - }; - smart_refctd_ptr dsLayout = base.m_device->createDescriptorSetLayout(bindings); - if (!dsLayout) - base.logFail("Failed to create a Descriptor Layout!\n"); - - SPushConstantRange pushConstantRanges[] = { - { - .stageFlags = ShaderStage::ESS_COMPUTE, - .offset = 0, - .size = sizeof(BenchmarkPushConstants) - } - }; - m_pplnLayout = base.m_device->createPipelineLayout(pushConstantRanges, smart_refctd_ptr(dsLayout)); - if (!m_pplnLayout) - base.logFail("Failed to create a Pipeline Layout!\n"); - - { - IGPUComputePipeline::SCreationParams params = {}; - params.layout = m_pplnLayout.get(); - params.shader.entryPoint = "main"; - params.shader.shader = shader.get(); - if 
(base.m_device->getEnabledFeatures().pipelineExecutableInfo) - { - params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS; - params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; - } - if (!base.m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_pipeline)) - base.logFail("Failed to create pipelines (compile & link shaders)!\n"); - - if (base.m_device->getEnabledFeatures().pipelineExecutableInfo) - { - auto report = system::to_string(m_pipeline->getExecutableInfo()); - base.m_logger->log("EF64Benchmark Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, report.c_str()); - } - } - - // Allocate the memory - { - static_assert(sizeof(float64_t) == sizeof(benchmark_emulated_float64_t)); - constexpr size_t BufferSize = BENCHMARK_WORKGROUP_COUNT * BENCHMARK_WORKGROUP_DIMENSION_SIZE_X * - BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y * BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z * sizeof(float64_t); - - nbl::video::IGPUBuffer::SCreationParams params = {}; - params.size = BufferSize; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - smart_refctd_ptr dummyBuff = base.m_device->createBuffer(std::move(params)); - if (!dummyBuff) - base.logFail("Failed to create a GPU Buffer of size %d!\n", params.size); - - dummyBuff->setObjectDebugName("benchmark buffer"); - - nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = dummyBuff->getMemoryReqs(); - - m_allocation = base.m_device->allocate(reqs, dummyBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); - if (!m_allocation.isValid()) - base.logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); - - assert(dummyBuff->getBoundMemory().memory == m_allocation.memory.get()); - smart_refctd_ptr pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 }); - - m_ds = pool->createDescriptorSet(std::move(dsLayout)); - { - IGPUDescriptorSet::SDescriptorInfo info[1]; - info[0].desc = 
smart_refctd_ptr(dummyBuff); - info[0].info.buffer = { .offset = 0,.size = BufferSize }; - IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { - {.dstSet = m_ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info} - }; - base.m_device->updateDescriptorSets(writes, {}); - } - } + IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pplnLayout.get(); + params.shader.entryPoint = "main"; + params.shader.shader = shader.get(); + if (base.m_device->getEnabledFeatures().pipelineExecutableInfo) + { + params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS; + params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; + } + if (!base.m_device->createComputePipelines(nullptr, {¶ms, 1}, &m_pipeline)) + base.logFail("Failed to create pipelines (compile & link shaders)!\n"); + + if (base.m_device->getEnabledFeatures().pipelineExecutableInfo) + { + auto report = system::to_string(m_pipeline->getExecutableInfo()); + base.m_logger->log("EF64Submitter Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, report.c_str()); + } } - IQueryPool::SCreationParams queryPoolCreationParams{}; - queryPoolCreationParams.queryType = IQueryPool::TYPE::TIMESTAMP; - queryPoolCreationParams.queryCount = 2; - queryPoolCreationParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; - m_queryPool = m_device->createQueryPool(queryPoolCreationParams); - - m_computeQueue = m_device->getQueue(m_queueFamily, 0); - } - - void run() - { - m_logger->log("\n\nfloat64_t benchmark result:", ILogger::ELL_PERFORMANCE); - performBenchmark(EF64_BENCHMARK_MODE::NATIVE); - m_logger->log("emulated_float64_t benchmark, fast math enabled result:", ILogger::ELL_PERFORMANCE); - performBenchmark(EF64_BENCHMARK_MODE::EF64_FAST_MATH_ENABLED); - m_logger->log("emulated_float64_t benchmark, fast math disabled result:", ILogger::ELL_PERFORMANCE); - performBenchmark(EF64_BENCHMARK_MODE::EF64_FAST_MATH_DISABLED); - // 
every subgroup with even ID do calculations with the `emulated_float64_t` type, other subgroups do calculations with float64_t - m_logger->log("emulated_float64_t benchmark, subgroup divided work result:", ILogger::ELL_PERFORMANCE); - performBenchmark(EF64_BENCHMARK_MODE::SUBGROUP_DIVIDED_WORK); - // every item does calculations with both emulated and native types - m_logger->log("emulated_float64_t benchmark, interleaved result:", ILogger::ELL_PERFORMANCE); - performBenchmark(EF64_BENCHMARK_MODE::INTERLEAVED); - } - - private: - void performBenchmark(EF64_BENCHMARK_MODE mode) - { - m_device->waitIdle(); - - recordTimestampQueryCmdBuffers(); - - uint64_t semaphoreCounter = 0; - smart_refctd_ptr semaphore = m_device->createSemaphore(semaphoreCounter); - - IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} }; - IQueue::SSubmitInfo::SSemaphoreInfo waits[] = { {.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT } }; - - IQueue::SSubmitInfo beforeTimestapSubmitInfo[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsBegin[] = { {.cmdbuf = m_timestampBeforeCmdBuff.get()} }; - beforeTimestapSubmitInfo[0].commandBuffers = cmdbufsBegin; - beforeTimestapSubmitInfo[0].signalSemaphores = signals; - beforeTimestapSubmitInfo[0].waitSemaphores = waits; - - IQueue::SSubmitInfo afterTimestapSubmitInfo[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsEnd[] = { {.cmdbuf = m_timestampAfterCmdBuff.get()} }; - afterTimestapSubmitInfo[0].commandBuffers = cmdbufsEnd; - afterTimestapSubmitInfo[0].signalSemaphores = signals; - afterTimestapSubmitInfo[0].waitSemaphores = waits; - - IQueue::SSubmitInfo benchmarkSubmitInfos[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} }; - benchmarkSubmitInfos[0].commandBuffers = cmdbufs; - 
benchmarkSubmitInfos[0].signalSemaphores = signals; - benchmarkSubmitInfos[0].waitSemaphores = waits; - - - m_pushConstants.benchmarkMode = mode; - recordCmdBuff(); - - // warmup runs - for (int i = 0; i < WarmupIterations; ++i) - { - if(i == 0) - m_api->startCapture(); - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(benchmarkSubmitInfos); - if (i == 0) - m_api->endCapture(); - } - - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(beforeTimestapSubmitInfo); - - // actual benchmark runs - for (int i = 0; i < Iterations; ++i) - { - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(benchmarkSubmitInfos); - } - - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(afterTimestapSubmitInfo); - - m_device->waitIdle(); - - const uint64_t nativeBenchmarkTimeElapsedNanoseconds = calcTimeElapsed(); - const float nativeBenchmarkTimeElapsedSeconds = double(nativeBenchmarkTimeElapsedNanoseconds) / 1000000000.0; - - m_logger->log("%llu ns, %f s", ILogger::ELL_PERFORMANCE, nativeBenchmarkTimeElapsedNanoseconds, nativeBenchmarkTimeElapsedSeconds); - } - - void recordCmdBuff() - { - m_cmdbuf->begin(IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); - m_cmdbuf->beginDebugMarker("emulated_float64_t compute dispatch", vectorSIMDf(0, 1, 0, 1)); - m_cmdbuf->bindComputePipeline(m_pipeline.get()); - m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); - m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(BenchmarkPushConstants), &m_pushConstants); - m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); - m_cmdbuf->endDebugMarker(); - m_cmdbuf->end(); - } - - void recordTimestampQueryCmdBuffers() - { - static bool firstInvocation = true; - - if (!firstInvocation) + // Allocate the memory { - 
m_timestampBeforeCmdBuff->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampBeforeCmdBuff->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); + constexpr size_t BufferSize = sizeof(TestValues); + + nbl::video::IGPUBuffer::SCreationParams params = {}; + params.size = BufferSize; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + smart_refctd_ptr outputBuff = base.m_device->createBuffer(std::move(params)); + if (!outputBuff) + base.logFail("Failed to create a GPU Buffer of size %d!\n", params.size); + + outputBuff->setObjectDebugName("emulated_float64_t output buffer"); + + nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs(); + reqs.memoryTypeBits &= base.m_physicalDevice->getHostVisibleMemoryTypeBits(); + + m_allocation = base.m_device->allocate(reqs, outputBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); + if (!m_allocation.isValid()) + base.logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + + assert(outputBuff->getBoundMemory().memory == m_allocation.memory.get()); + smart_refctd_ptr pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, {&dsLayout.get(), 1}); + + m_ds = pool->createDescriptorSet(std::move(dsLayout)); + { + IGPUDescriptorSet::SDescriptorInfo info[1]; + info[0].desc = smart_refctd_ptr(outputBuff); + info[0].info.buffer = {.offset = 0, .size = BufferSize}; + IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { + {.dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = info}}; + base.m_device->updateDescriptorSets(writes, {}); + } } - m_timestampBeforeCmdBuff->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampBeforeCmdBuff->resetQueryPool(m_queryPool.get(), 0, 2); - m_timestampBeforeCmdBuff->writeTimestamp(PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 0); - m_timestampBeforeCmdBuff->end(); - - m_timestampAfterCmdBuff->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - 
m_timestampAfterCmdBuff->writeTimestamp(PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 1); - m_timestampAfterCmdBuff->end(); - - firstInvocation = false; - } - - uint64_t calcTimeElapsed() - { - uint64_t timestamps[2]; - const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); - m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, ×tamps, sizeof(uint64_t), flags); - return timestamps[1] - timestamps[0]; - } - - private: - core::smart_refctd_ptr m_api; - smart_refctd_ptr m_device; - smart_refctd_ptr m_logger; - - nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; - smart_refctd_ptr m_cmdpool = nullptr; - smart_refctd_ptr m_cmdbuf = nullptr; - smart_refctd_ptr m_ds = nullptr; - smart_refctd_ptr m_pplnLayout = nullptr; - BenchmarkPushConstants m_pushConstants; - smart_refctd_ptr m_pipeline; - - smart_refctd_ptr m_timestampBeforeCmdBuff = nullptr; - smart_refctd_ptr m_timestampAfterCmdBuff = nullptr; - smart_refctd_ptr m_queryPool = nullptr; - - uint32_t m_queueFamily; - IQueue* m_computeQueue; - static constexpr int WarmupIterations = 1000; - static constexpr int Iterations = 1000; - using benchmark_emulated_float64_t = emulated_float64_t; - }; - - template - inline bool logFail(const char* msg, Args&&... 
args) - { - m_logger->log(msg, ILogger::ELL_ERROR, std::forward(args)...); - return false; - } - - std::ofstream m_logFile; + if (!m_allocation.memory->map({0ull, m_allocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ)) + base.logFail("Failed to map the Device Memory!\n"); + } + + // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches + const ILogicalDevice::MappedMemoryRange memoryRange(m_allocation.memory.get(), 0ull, m_allocation.memory->getAllocationSize()); + if (!m_allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + base.m_device->invalidateMappedMemoryRanges(1, &memoryRange); + + assert(memoryRange.valid() && memoryRange.length >= sizeof(TestValues)); + + m_queue = m_base.m_device->getQueue(m_queueFamily, 0); + } + + ~EF64Submitter() + { + m_allocation.memory->unmap(); + } + + void setPushConstants(PushConstants& pc) + { + m_pushConstants = pc; + } + + TestValues submitGetGPUTestValues() + { + // record command buffer + m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::NONE); + m_cmdbuf->beginDebugMarker("emulated_float64_t compute dispatch", vectorSIMDf(0, 1, 0, 1)); + m_cmdbuf->bindComputePipeline(m_pipeline.get()); + m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); + m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PushConstants), &m_pushConstants); + m_cmdbuf->dispatch(WORKGROUP_SIZE, 1, 1); + m_cmdbuf->endDebugMarker(); + m_cmdbuf->end(); + + IQueue::SSubmitInfo submitInfos[1] = {}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = {{.cmdbuf = m_cmdbuf.get()}}; + submitInfos[0].commandBuffers = cmdbufs; + const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {{.semaphore = m_semaphore.get(), .value = ++m_semaphoreCounter, .stageMask = 
asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; + submitInfos[0].signalSemaphores = signals; + + m_base.m_api->startCapture(); + m_queue->submit(submitInfos); + m_base.m_api->endCapture(); + + m_base.m_device->waitIdle(); + TestValues output; + std::memcpy(&output, static_cast*>(m_allocation.memory->getMappedPointer()), sizeof(TestValues)); + m_base.m_device->waitIdle(); + + return output; + } + + private: + uint32_t m_queueFamily; + nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; + smart_refctd_ptr m_cmdbuf = nullptr; + smart_refctd_ptr m_cmdpool = nullptr; + smart_refctd_ptr m_ds = nullptr; + smart_refctd_ptr m_pplnLayout = nullptr; + PushConstants m_pushConstants; + CompatibilityTest& m_base; + smart_refctd_ptr m_pipeline; + smart_refctd_ptr m_semaphore; + IQueue* m_queue; + uint64_t m_semaphoreCounter; + }; + + void emulated_float64_tests() + { + EF64Submitter submitter(*this); + + auto printTestOutput = [this](const std::string& functionName, const EmulatedFloat64TestOutput& testResult) + { + std::cout << functionName << ": " << std::endl; + + if (!testResult.cpuTestsSucceed) + logFail("Incorrect CPU determinated values!"); + else + m_logger->log("Correct CPU determinated values!", ILogger::ELL_PERFORMANCE); + + if (!testResult.gpuTestsSucceed) + logFail("Incorrect GPU determinated values!"); + else + m_logger->log("Correct GPU determinated values!", ILogger::ELL_PERFORMANCE); + }; + + m_logFile.open("EmulatedFloatTestLog.txt", std::ios::out | std::ios::trunc); + if (!m_logFile.is_open()) + m_logger->log("Failed to open log file!", system::ILogger::ELL_ERROR); + + printTestOutput("emulatedFloat64RandomValuesTest", emulatedFloat64RandomValuesTest(submitter)); + printTestOutput("emulatedFloat64RandomValuesTestContrastingExponents", emulatedFloat64RandomValuesTestContrastingExponents(submitter)); + printTestOutput("emulatedFloat64NegAndPosZeroTest", emulatedFloat64NegAndPosZeroTest(submitter)); + 
printTestOutput("emulatedFloat64BothValuesInfTest", emulatedFloat64BothValuesInfTest(submitter)); + printTestOutput("emulatedFloat64BothValuesNegInfTest", emulatedFloat64BothValuesNegInfTest(submitter)); + printTestOutput("emulatedFloat64OneValIsInfOtherIsNegInfTest", emulatedFloat64OneValIsInfOtherIsNegInfTest(submitter)); + printTestOutput("emulatedFloat64OneValIsInfTest", emulatedFloat64OneValIsInfTest(submitter)); + printTestOutput("emulatedFloat64OneValIsNegInfTest", emulatedFloat64OneValIsNegInfTest(submitter)); + if (false) // doesn't work for some reason + fast math is enabled by default + printTestOutput("emulatedFloat64BNaNTest", emulatedFloat64BNaNTest(submitter)); + printTestOutput("emulatedFloat64BInfTest", emulatedFloat64OneValIsZeroTest(submitter)); + printTestOutput("emulatedFloat64BNegInfTest", emulatedFloat64OneValIsNegZeroTest(submitter)); + + m_logFile.close(); + } + + template + struct EmulatedFloat64TestValuesInfo + { + emulated_float64_t a; + emulated_float64_t b; + ConstructorTestValues constrTestValues; + TestValues expectedTestValues; + + void fillExpectedTestValues() + { + double aAsDouble = reinterpret_cast(a); + double bAsDouble = reinterpret_cast(b); + + expectedTestValues.a = a.data; + expectedTestValues.b = b.data; + + expectedTestValues.int32CreateVal = bit_cast(double(constrTestValues.int32)); + expectedTestValues.int64CreateVal = bit_cast(double(constrTestValues.int64)); + expectedTestValues.uint32CreateVal = bit_cast(double(constrTestValues.uint32)); + expectedTestValues.uint64CreateVal = bit_cast(double(constrTestValues.uint64)); + expectedTestValues.float32CreateVal = bit_cast(double(constrTestValues.float32)); + expectedTestValues.float64CreateVal = bit_cast(constrTestValues.float64); + expectedTestValues.additionVal = emulated_float64_t::create(aAsDouble + bAsDouble).data; + expectedTestValues.substractionVal = emulated_float64_t::create(aAsDouble - bAsDouble).data; + expectedTestValues.multiplicationVal = 
emulated_float64_t::create(aAsDouble * bAsDouble).data; + expectedTestValues.divisionVal = emulated_float64_t::create(aAsDouble / bAsDouble).data; + expectedTestValues.lessOrEqualVal = aAsDouble <= bAsDouble; + expectedTestValues.greaterOrEqualVal = aAsDouble >= bAsDouble; + expectedTestValues.equalVal = aAsDouble == bAsDouble; + expectedTestValues.notEqualVal = aAsDouble != bAsDouble; + expectedTestValues.lessVal = aAsDouble < bAsDouble; + expectedTestValues.greaterVal = aAsDouble > bAsDouble; + } + }; + + struct EmulatedFloat64TestOutput + { + bool cpuTestsSucceed; + bool gpuTestsSucceed; + }; + + EmulatedFloat64TestOutput emulatedFloat64LoopedTests_impl(EF64Submitter& submitter, + const uint32_t iterations, + const std::function& determineValueA, + const std::function& determineValueB) + { + EmulatedFloat64TestOutput output = {true, true}; + + std::uniform_int_distribution i32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_int_distribution i64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_int_distribution u32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_int_distribution u64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_real_distribution fDistribution(-100000.0, 100000.0); + + std::random_device rd; + std::mt19937 mt(rd()); + + for (uint32_t i = 0u; i < iterations; ++i) + { + // generate random test values + EmulatedFloat64TestValuesInfo testValInfo; + double aTmp = determineValueA(); + double bTmp = determineValueB(); + testValInfo.a.data = reinterpret_cast::storage_t&>(aTmp); + testValInfo.b.data = reinterpret_cast::storage_t&>(bTmp); + testValInfo.constrTestValues.int32 = i32Distribution(mt); + testValInfo.constrTestValues.int64 = i64Distribution(mt); + testValInfo.constrTestValues.uint32 = u32Distribution(mt); + testValInfo.constrTestValues.uint64 = u64Distribution(mt); + 
testValInfo.constrTestValues.float32 = fDistribution(mt); + testValInfo.constrTestValues.float64 = fDistribution(mt); + + testValInfo.fillExpectedTestValues(); + auto singleTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); + + if (!singleTestOutput.cpuTestsSucceed) + output.cpuTestsSucceed = false; + if (!singleTestOutput.gpuTestsSucceed) + output.gpuTestsSucceed = false; + } + + return output; + } + + EmulatedFloat64TestOutput emulatedFloat64RandomValuesTest(EF64Submitter& submitter) + { + auto getRandomFloat64 = []() + { + static std::random_device rd; + static std::mt19937 mt(rd()); + static std::uniform_real_distribution distribution(-100000.0, 100000.0); + + + return distribution(mt); + }; + + return emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations, getRandomFloat64, getRandomFloat64); + } + + EmulatedFloat64TestOutput emulatedFloat64RandomValuesTestContrastingExponents(EF64Submitter& submitter) + { + auto getRandomSmallFloat64 = []() + { + static std::random_device rd; + static std::mt19937 mt(rd()); + static std::uniform_real_distribution distribution(-0.01, 0.01); + + return distribution(mt); + }; + + auto getRandomLargeFloat64 = []() + { + static std::random_device rd; + static std::mt19937 mt(rd()); + static std::uniform_real_distribution distribution(1000000000.0, 2000000000.0); + static std::uniform_int_distribution coinFlipDistribution(0, 1); + + double output = distribution(mt); + if (coinFlipDistribution(mt)) + output = -output; + + return output; + }; + + EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomSmallFloat64, getRandomLargeFloat64); + EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomLargeFloat64, getRandomSmallFloat64); + + EmulatedFloat64TestOutput output; + output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && 
secondTestOutput.cpuTestsSucceed; + output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; + return output; + } + + EmulatedFloat64TestOutput emulatedFloat64BothValuesNaNTest(EF64Submitter& submitter) + { + smart_refctd_ptr semaphore = m_device->createSemaphore(0); + + EmulatedFloat64TestValuesInfo testValInfo; + const float32_t nan32 = std::numeric_limits::quiet_NaN(); + const float64_t nan64 = std::numeric_limits::quiet_NaN(); + testValInfo.a = emulated_float64_t::create(nan64); + testValInfo.b = emulated_float64_t::create(nan64); + testValInfo.constrTestValues = { + .int32 = std::bit_cast(nan32), + .int64 = std::bit_cast(nan64), + .uint32 = std::bit_cast(nan32), + .uint64 = std::bit_cast(nan64), + .float32 = nan32 + //.float64 = nan64 + }; + + testValInfo.fillExpectedTestValues(); + return performEmulatedFloat64Tests(testValInfo, submitter); + } + + EmulatedFloat64TestOutput emulatedFloat64NegAndPosZeroTest(EF64Submitter& submitter) + { + smart_refctd_ptr semaphore = m_device->createSemaphore(0); + + EmulatedFloat64TestValuesInfo testValInfo; + testValInfo.a = emulated_float64_t::create(ieee754::traits::signMask); + testValInfo.b = emulated_float64_t::create(std::bit_cast(0.0)); + testValInfo.constrTestValues = { + .int32 = 0, + .int64 = 0, + .uint32 = 0, + .uint64 = 0, + .float32 = 0}; + + testValInfo.fillExpectedTestValues(); + auto firstTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); + std::swap(testValInfo.a, testValInfo.b); + testValInfo.fillExpectedTestValues(); + auto secondTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); + + return {firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed, firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed}; + } + + EmulatedFloat64TestOutput emulatedFloat64BothValuesInfTest(EF64Submitter& submitter) + { + smart_refctd_ptr semaphore = m_device->createSemaphore(0); + + EmulatedFloat64TestValuesInfo testValInfo; + const 
float32_t inf32 = std::numeric_limits::infinity(); + const float64_t inf64 = std::numeric_limits::infinity(); + testValInfo.a = emulated_float64_t::create(inf64); + testValInfo.b = emulated_float64_t::create(inf64); + testValInfo.constrTestValues = { + .int32 = 0, + .int64 = 0, + .uint32 = 0, + .uint64 = 0, + .float32 = inf32 + //.float64 = inf64 + }; + + testValInfo.fillExpectedTestValues(); + return performEmulatedFloat64Tests(testValInfo, submitter); + } + + EmulatedFloat64TestOutput emulatedFloat64BothValuesNegInfTest(EF64Submitter& submitter) + { + smart_refctd_ptr semaphore = m_device->createSemaphore(0); + + EmulatedFloat64TestValuesInfo testValInfo; + const float32_t inf32 = -std::numeric_limits::infinity(); + const float64_t inf64 = -std::numeric_limits::infinity(); + testValInfo.a = emulated_float64_t::create(inf64); + testValInfo.b = emulated_float64_t::create(inf64); + testValInfo.constrTestValues = { + .int32 = 0, + .int64 = 0, + .uint32 = 0, + .uint64 = 0, + .float32 = inf32 + //.float64 = inf64 + }; + + testValInfo.fillExpectedTestValues(); + return performEmulatedFloat64Tests(testValInfo, submitter); + } + + EmulatedFloat64TestOutput emulatedFloat64OneValIsInfOtherIsNegInfTest(EF64Submitter& submitter) + { + smart_refctd_ptr semaphore = m_device->createSemaphore(0); + + EmulatedFloat64TestValuesInfo testValInfo; + const float64_t inf64 = -std::numeric_limits::infinity(); + testValInfo.a = emulated_float64_t::create(inf64); + testValInfo.b = emulated_float64_t::create(inf64); + testValInfo.constrTestValues = { + .int32 = 0, + .int64 = 0, + .uint32 = 0, + .uint64 = 0, + .float32 = 0 + //.float64 = inf64 + }; + + testValInfo.fillExpectedTestValues(); + auto firstTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); + std::swap(testValInfo.a, testValInfo.b); + testValInfo.fillExpectedTestValues(); + auto secondTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); + + return {firstTestOutput.cpuTestsSucceed && 
secondTestOutput.cpuTestsSucceed, firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed}; + } + + // TODO: fix + EmulatedFloat64TestOutput emulatedFloat64BNaNTest(EF64Submitter& submitter) + { + EmulatedFloat64TestOutput output = {true, true}; + smart_refctd_ptr semaphore = m_device->createSemaphore(0); + + for (uint32_t i = 0u; i < EmulatedFloat64TestIterations; ++i) + { + std::random_device rd; + std::mt19937 mt(rd()); + + std::uniform_int_distribution i32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_int_distribution i64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_int_distribution u32Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_int_distribution u64Distribution(-std::numeric_limits::max(), std::numeric_limits::max()); + std::uniform_real_distribution f32Distribution(-100000.0f, 100000.0f); + std::uniform_real_distribution f64Distribution(-100000.0, 100000.0); + + EmulatedFloat64TestValuesInfo testValInfo; + double aTmp = f64Distribution(mt); + double bTmp = std::numeric_limits::quiet_NaN(); + testValInfo.a.data = reinterpret_cast::storage_t&>(aTmp); + testValInfo.b.data = reinterpret_cast::storage_t&>(bTmp); + testValInfo.constrTestValues.int32 = i32Distribution(mt); + testValInfo.constrTestValues.int64 = i64Distribution(mt); + testValInfo.constrTestValues.uint32 = u32Distribution(mt); + testValInfo.constrTestValues.uint64 = u64Distribution(mt); + testValInfo.constrTestValues.float32 = f32Distribution(mt); + //testValInfo.constrTestValues.float64 = f64Distribution(mt); + + testValInfo.fillExpectedTestValues(); + auto singleTestOutput = performEmulatedFloat64Tests(testValInfo, submitter); + + if (!singleTestOutput.cpuTestsSucceed) + output.cpuTestsSucceed = false; + if (!singleTestOutput.gpuTestsSucceed) + output.gpuTestsSucceed = false; + } + + return output; + } + + EmulatedFloat64TestOutput 
emulatedFloat64OneValIsInfTest(EF64Submitter& submitter) + { + auto getRandomFloat64 = []() + { + static std::random_device rd; + static std::mt19937 mt(rd()); + static std::uniform_real_distribution distribution(-100000.0, 100000.0); + + return distribution(mt); + }; + + auto getInfinity = []() + { + return std::numeric_limits::infinity(); + }; + + EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getInfinity); + EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getInfinity, getRandomFloat64); + + EmulatedFloat64TestOutput output; + output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; + output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; + return output; + } + + EmulatedFloat64TestOutput emulatedFloat64OneValIsNegInfTest(EF64Submitter& submitter) + { + auto getRandomFloat64 = []() + { + static std::random_device rd; + static std::mt19937 mt(rd()); + static std::uniform_real_distribution distribution(-100000.0, 100000.0); + + + return distribution(mt); + }; + + auto getNegInfinity = []() + { + return -std::numeric_limits::infinity(); + }; + + EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getNegInfinity); + EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getNegInfinity, getRandomFloat64); + + EmulatedFloat64TestOutput output; + output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; + output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; + return output; + } + + EmulatedFloat64TestOutput emulatedFloat64OneValIsZeroTest(EF64Submitter& submitter) + { + auto getRandomFloat64 = []() + { 
+ static std::random_device rd; + static std::mt19937 mt(rd()); + static std::uniform_real_distribution distribution(-100000.0, 100000.0); + + return distribution(mt); + }; + + auto getZero = []() + { + return 0.0; + }; + + EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getZero); + EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getZero, getRandomFloat64); + + EmulatedFloat64TestOutput output; + output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; + output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; + return output; + } + + EmulatedFloat64TestOutput emulatedFloat64OneValIsNegZeroTest(EF64Submitter& submitter) + { + auto getRandomFloat64 = []() + { + static std::random_device rd; + static std::mt19937 mt(rd()); + static std::uniform_real_distribution distribution(-100000.0, 100000.0); + + return distribution(mt); + }; + + auto getNegZero = []() + { + return -0.0; + }; + + EmulatedFloat64TestOutput firstTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getRandomFloat64, getNegZero); + EmulatedFloat64TestOutput secondTestOutput = emulatedFloat64LoopedTests_impl(submitter, EmulatedFloat64TestIterations / 2, getNegZero, getRandomFloat64); + + EmulatedFloat64TestOutput output; + output.cpuTestsSucceed = firstTestOutput.cpuTestsSucceed && secondTestOutput.cpuTestsSucceed; + output.gpuTestsSucceed = firstTestOutput.gpuTestsSucceed && secondTestOutput.gpuTestsSucceed; + return output; + } + + template + EmulatedFloat64TestOutput performEmulatedFloat64Tests(EmulatedFloat64TestValuesInfo& testValInfo, EF64Submitter& submitter) + { + emulated_float64_t a = testValInfo.a; + emulated_float64_t b = testValInfo.b; + + const TestValues cpuTestValues = { + .int32CreateVal = 
emulated_float64_t::create(testValInfo.constrTestValues.int32).data, + .int64CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.int64).data, + .uint32CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.uint32).data, + .uint64CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.uint64).data, + .float32CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.float32).data, + .float64CreateVal = emulated_float64_t::create(testValInfo.constrTestValues.float64).data, + .additionVal = (a + b).data, + .substractionVal = (a - b).data, + .multiplicationVal = (a * b).data, + .divisionVal = (a / b).data, + .lessOrEqualVal = a <= b, + .greaterOrEqualVal = a >= b, + .equalVal = a == b, + .notEqualVal = a != b, + .lessVal = a + b}; + + EmulatedFloat64TestOutput output; + + // cpu validation + output.cpuTestsSucceed = compareEmulatedFloat64TestValues(testValInfo.expectedTestValues, cpuTestValues); + + // gpu validation + PushConstants pc; + pc.a = reinterpret_cast(a); + pc.b = reinterpret_cast(b); + pc.constrTestVals = testValInfo.constrTestValues; + + submitter.setPushConstants(pc); + auto gpuTestValues = submitter.submitGetGPUTestValues(); + + output.gpuTestsSucceed = compareEmulatedFloat64TestValues(testValInfo.expectedTestValues, gpuTestValues); + + return output; + } + + void runEF64Benchmarks() + { + constexpr uint32_t WarmupDispatches = 1000; + constexpr uint64_t TargetBudgetMs = 400; // ~400ms per row + + Aggregator agg(m_logger, m_device, m_physicalDevice, getComputeQueue()->getFamilyIndex()); + agg.applyCli({ + .argv = this->argv, + .defaultOutputPath = "EF64Bench.json", + .appName = "64_EmulatedFloatTest", + }); + + const auto shaderKey = nbl::this_example::builtin::build::get_spirv_key<"benchmark">(m_device.get()); + auto shaderVariant = GPUBenchmarkHelper::ShaderVariant::Precompiled(shaderKey); + + // One bench instance per mode -> one report row per mode. 
The std::vector + // below is reserved to N up front, so the N emplace_back calls never reallocate; no parallel + // benchPtrs vector needed since the aggregator iterates the span + // directly. + constexpr std::pair kModes[] = { + {EF64_BENCHMARK_MODE::NATIVE, "native"}, + {EF64_BENCHMARK_MODE::EF64_FAST_MATH_ENABLED, "emulated, fast-math"}, + {EF64_BENCHMARK_MODE::EF64_FAST_MATH_DISABLED, "emulated, strict"}, + {EF64_BENCHMARK_MODE::SUBGROUP_DIVIDED_WORK, "subgroup-divided"}, + {EF64_BENCHMARK_MODE::INTERLEAVED, "interleaved"}, + }; + constexpr size_t N = std::size(kModes); + std::vector benches; + benches.reserve(N); + for (size_t i = 0; i < N; ++i) + { + const auto& [mode, leaf] = kModes[i]; + benches.emplace_back(agg, CEF64Benchmark::SetupData{ + .assetMgr = m_assetMgr, + .name = {"EF64", leaf}, + .mode = mode, + .variant = shaderVariant, + .warmupDispatches = WarmupDispatches, + .targetBudgetMs = TargetBudgetMs, + }); + } + + const RunContext ctx = { + .shape = CEF64Benchmark::shape(), + .targetBudgetMs = TargetBudgetMs, + .sectionLabel = CEF64Benchmark::kSectionLabel, + }; + agg.runSessionAndReport(Aggregator::makeSpan(benches, ctx)); + } + + + template + inline bool logFail(const char* msg, Args&&... 
args) + { + m_logger->log(msg, ILogger::ELL_ERROR, std::forward(args)...); + return false; + } + + std::ofstream m_logFile; }; -NBL_MAIN_FUNC(CompatibilityTest) \ No newline at end of file +NBL_MAIN_FUNC(CompatibilityTest) diff --git a/73_SolidAngleVisualizer/CMakeLists.txt b/73_SolidAngleVisualizer/CMakeLists.txt new file mode 100644 index 000000000..0709770be --- /dev/null +++ b/73_SolidAngleVisualizer/CMakeLists.txt @@ -0,0 +1,142 @@ +if(NBL_BUILD_IMGUI) + set(NBL_EXTRA_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/src/transform.cpp" + ) + + set(NBL_INCLUDE_SERACH_DIRECTORIES + "${CMAKE_CURRENT_SOURCE_DIR}/include" + ) + + list(APPEND NBL_LIBRARIES + imtestengine + imguizmo + "${NBL_EXT_IMGUI_UI_LIB}" + Nabla::ext::FullScreenTriangle + ) + + # TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` from the last parameter here, doesn't this macro have 4 arguments anyway !? + nbl_create_executable_project("${NBL_EXTRA_SOURCES}" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}") + + if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) + endif() + + # 
TODO: Arek temporarily disabled cause I haven't figured out how to make this target yet + # LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD) + set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") + # Entries must match the on-disk file names exactly (e.g. Drawing.hlsl); a casing mismatch makes target_sources fail on case-sensitive file systems. + set(DEPENDS + app_resources/hlsl/common.hlsl + app_resources/hlsl/debug_vis.hlsl + app_resources/hlsl/Drawing.hlsl + app_resources/hlsl/silhouette.hlsl + app_resources/hlsl/utils.hlsl + app_resources/hlsl/triangle_sampling.hlsl + app_resources/hlsl/parallelogram_sampling.hlsl + app_resources/hlsl/pyramid_sampling.hlsl + app_resources/hlsl/obb_face_sampling.hlsl + + app_resources/hlsl/pyramid_sampling/bilinear.hlsl + + app_resources/hlsl/solid_angle_vis.frag.hlsl + app_resources/hlsl/ray_vis.frag.hlsl + + app_resources/hlsl/benchmark/benchmark.comp.hlsl + app_resources/hlsl/benchmark/common.hlsl + ) + target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS}) + set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON) + + set(SM 6_8) + set(SA_VIS "app_resources/hlsl/solid_angle_vis.frag.hlsl") + set(RAY_VIS "app_resources/hlsl/ray_vis.frag.hlsl") + set(BENCH "app_resources/hlsl/benchmark/benchmark.comp.hlsl") + + set(JSON [=[ + [ + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_sa", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_sa_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_psa", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_psa_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE", 
"-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_para", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_para_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_rectangle", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_bilinear", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_bilinear_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_proj_rectangle", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_proj_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_silhouette", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": 
"${SA_VIS}", "KEY": "sa_vis_silhouette_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_pyramid", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_pyramid_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_caliper_pyramid", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_caliper_pyramid_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_caliper_rectangle", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_caliper_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_obb_face", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_obb_face_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + + {"INPUT": "${RAY_VIS}", "KEY": "ray_vis", "COMPILE_OPTIONS": 
["-T", "ps_${SM}", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${RAY_VIS}", "KEY": "ray_vis_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + + {"INPUT": "${BENCH}", "KEY": "benchmark_tri_sa", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_tri_psa", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_para", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_bilinear", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_proj_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_silhouette", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_pyramid_creation", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_caliper_pyramid_creation", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_caliper_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID"]}, + {"INPUT": "${BENCH}", "KEY": 
"benchmark_obb_face_direct", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT"]}, + ] + ]=]) + string(CONFIGURE "${JSON}" JSON) + + set(COMPILE_OPTIONS + -I "${CMAKE_CURRENT_SOURCE_DIR}" + -Zi -Qembed_debug + + # -fspv-debug=file + # -fspv-debug=source + # -fspv-debug=line + -enable-16bit-types + ) + + NBL_CREATE_NSC_COMPILE_RULES( + TARGET ${EXECUTABLE_NAME}SPIRV + LINK_TO ${EXECUTABLE_NAME} + DEPENDS ${DEPENDS} + BINARY_DIR ${OUTPUT_DIRECTORY} + MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT + COMMON_OPTIONS ${COMPILE_OPTIONS} + OUTPUT_VAR KEYS + INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp + NAMESPACE nbl::this_example::builtin::build + INPUTS ${JSON} + ) + + NBL_CREATE_RESOURCE_ARCHIVE( + NAMESPACE nbl::this_example::builtin::build + TARGET ${EXECUTABLE_NAME}_builtinsBuild + LINK_TO ${EXECUTABLE_NAME} + BIND ${OUTPUT_DIRECTORY} + BUILTINS ${KEYS} + ) +endif() \ No newline at end of file diff --git a/73_SolidAngleVisualizer/README.md b/73_SolidAngleVisualizer/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl new file mode 100644 index 000000000..c2239037b --- /dev/null +++ b/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl @@ -0,0 +1,424 @@ +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". +//// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _SOLID_ANGLE_VIS_EXAMPLE_DRAWING_HLSL_INCLUDED_ +#define _SOLID_ANGLE_VIS_EXAMPLE_DRAWING_HLSL_INCLUDED_ + +#include "common.hlsl" +#include "silhouette.hlsl" +#include + +using namespace nbl::hlsl; + +// ============================================================================ +// SphereDrawer: all visualization primitives for the solid angle visualizer. 
+// All methods are static and read VisContext for ndc/spherePos/aaWidth. +// ============================================================================ +struct SphereDrawer +{ + // ======================================================================== + // Coordinate helpers + // ======================================================================== + + // Project sphere point to circle-space (doesn't change Z) + static float32_t3 sphereToCircle(float32_t3 spherePoint) + { + if (spherePoint.z >= 0.0f) + { + return float32_t3(spherePoint.xy * CIRCLE_RADIUS, spherePoint.z); + } + else + { + float32_t r2 = (1.0f - spherePoint.z) / (1.0f + spherePoint.z); + float32_t uv2Plus1 = r2 + 1.0f; + return float32_t3((spherePoint.xy * uv2Plus1 / 2.0f) * CIRCLE_RADIUS, spherePoint.z); + } + } + + // ======================================================================== + // Primitives + // ======================================================================== + + // Great circle arc between two points on the sphere + static float32_t drawGreatCircleArc(float32_t3 points[2], float32_t width = 0.01f) + { + float32_t3 v0 = normalize(points[0]); + float32_t3 v1 = normalize(points[1]); + float32_t3 ndc = normalize(VisContext::spherePos()); + + float32_t3 arcNormal = normalize(cross(v0, v1)); + float32_t dist = abs(dot(ndc, arcNormal)); + + float32_t dotMid = dot(v0, v1); + bool onArc = (dot(ndc, v0) >= dotMid) && (dot(ndc, v1) >= dotMid); + + if (!onArc) + return 0.0f; + + float32_t avgDepth = (length(points[0]) + length(points[1])) * 0.5f; + float32_t depthScale = 3.0f / avgDepth; + + width = min(width * depthScale, 0.02f); + const float32_t aaWidth = VisContext::aaWidth(); + float32_t alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist); + + return alpha; + } + + // 2D cross marker + static float32_t drawCross2D(float32_t2 fragPos, float32_t2 center, float32_t size, float32_t thickness) + { + float32_t2 ndc = abs(fragPos - center); + + bool inHorizontal = 
(ndc.x <= size && ndc.y <= thickness); + bool inVertical = (ndc.y <= size && ndc.x <= thickness); + + return (inHorizontal || inVertical) ? 1.0f : 0.0f; + } + + // Dot (circle) with optional inner hollow for hidden corners + static float32_t4 drawDot(float32_t3 cornerNDCPos, float32_t dotSize, float32_t innerDotSize, float32_t3 dotColor) + { + float32_t4 color = float32_t4(0, 0, 0, 0); + const float32_t aaWidth = VisContext::aaWidth(); + const float32_t2 ndc = VisContext::ndc(); + const float32_t dist = length(ndc - cornerNDCPos.xy); + + float32_t outerAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist); + + if (outerAlpha <= 0.0f) + return color; + + color += float32_t4(dotColor * outerAlpha, outerAlpha); + + if (cornerNDCPos.z < 0.0f && innerDotSize > 0.0) + { + float32_t innerAlpha = 1.0f - smoothstep(innerDotSize - aaWidth, innerDotSize + aaWidth, dist); + innerAlpha *= outerAlpha; + color -= float32_t4(hlsl::promote(innerAlpha), 0.0f); + } + + return color; + } + + // Line segment in NDC space + static float32_t lineSegment(float32_t2 ndc, float32_t2 a, float32_t2 b, float32_t thickness) + { + float32_t2 pa = ndc - a; + float32_t2 ba = b - a; + float32_t h = saturate(dot(pa, ba) / dot(ba, ba)); + float32_t dist = length(pa - ba * h); + return smoothstep(thickness, thickness * 0.5, dist); + } + + // Draw half of a great circle (visible half of a lune boundary) + static float32_t4 drawGreatCircleHalf(float32_t3 normal, float32_t3 axis3, float32_t3 color, float32_t thickness) + { + // Point is on great circle if dot(point, normal) ~= 0 + // Only draw the half where dot(point, axis3) > 0 (toward silhouette) + const float32_t3 spherePos = VisContext::spherePos(); + const float32_t aaWidth = VisContext::aaWidth(); + + float32_t dist = abs(dot(spherePos, normal)); + float32_t sideFade = smoothstep(-0.1f, 0.1f, dot(spherePos, axis3)); + float32_t alpha = (1.0f - smoothstep(thickness - aaWidth, thickness + aaWidth, dist)) * sideFade; + return 
float32_t4(color * alpha, alpha); + } + + // Unit-circle ring + static float32_t4 drawRing(float32_t2 ndc) + { + const float32_t aaWidth = VisContext::aaWidth(); + float32_t ringWidth = 0.003f; + float32_t positionLength = length(ndc); + + float32_t ringDistance = abs(positionLength - CIRCLE_RADIUS); + float32_t ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance); + return ringAlpha * float32_t4(0, 0, 0, 1); + } + + // ======================================================================== + // Composite drawing helpers + // ======================================================================== + + // Silhouette edge with color from LUT + static float32_t4 drawEdge(uint32_t originalEdgeIdx, float32_t3 pts[2], float32_t width = 0.003f) + { + float32_t alpha = drawGreatCircleArc(pts, width); + return float32_t4(colorLUT[originalEdgeIdx] * alpha, alpha); + } + + static float32_t4 drawCorner(float32_t3 cornerPos, float32_t dotSize, float32_t innerDotSize, float32_t3 dotColor) + { + float32_t3 cornerCirclePos = sphereToCircle(cornerPos); + return drawDot(cornerCirclePos, dotSize, innerDotSize, dotColor); + } + + // All 8 cube corners as colored dots + static float32_t4 drawCorners(float32_t3x4 modelMatrix, float32_t dotSize) + { + float32_t4 color = float32_t4(0, 0, 0, 0); + float32_t innerDotSize = dotSize * 0.5f; + + shapes::OBBView view = shapes::OBBView::create(modelMatrix); + + for (uint32_t i = 0; i < 8; i++) + { + color += drawCorner(normalize(view.getVertex(i)), dotSize, innerDotSize, colorLUT[i]); + } + + return color; + } + + static float32_t4 drawClippedSilhouetteVertices(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count) + { + const float32_t dotSize = 0.03f; + const float32_t2 ndc = VisContext::ndc(); + const float32_t rcpDenom = rcp(float32_t(max(1u, count - 1))); + + float32_t4 color = 0; + + for (uint32_t i = 0; i < count; i++) + { + const float32_t3 cornerCirclePos = sphereToCircle(normalize(vertices[i])); 
+ const float32_t dist = length(ndc - cornerCirclePos.xy); + const float32_t alpha = 1.0f - smoothstep(dotSize * 0.8f, dotSize, dist); + if (alpha > 0.0f) + { + const float32_t t = float32_t(i) * rcpDenom; + const float32_t3 vertexColor = lerp(float32_t3(1, 0, 0), float32_t3(0, 1, 1), t); + color += float32_t4(vertexColor * alpha, alpha); + } + } + + return color; + } + + // Non-silhouette cube edges (drawn as faint lines) + static float32_t4 drawHiddenEdges(float32_t3x4 modelMatrix, uint32_t silEdgeMask) + { + float32_t4 color = 0; + float32_t3 hiddenEdgeColor = float32_t3(0.1, 0.1, 0.1); + + shapes::OBBView view = shapes::OBBView::create(modelMatrix); + + // Enumerate all 12 cube edges: for each of 3 axes, 4 edges parallel to that axis. + // compact (0..3) is the 2-bit corner index with the axis bit stripped out. + // Reconstruct the full corner by re-inserting the axis bit as 0. + NBL_UNROLL + for (uint32_t axis = 0; axis < 3; axis++) + { + NBL_UNROLL + for (uint32_t compact = 0; compact < 4; compact++) + { + uint32_t edgeIdx = axis * 4 + compact; + if (silEdgeMask & (1u << edgeIdx)) + continue; + + // Re-insert the axis bit (as 0) to recover the low corner index + uint32_t below = compact & ((1u << axis) - 1u); + uint32_t above = compact >> axis; + uint32_t corner = (above << (axis + 1u)) | below; + + float32_t3 v0 = normalize(view.getVertex(corner)); + float32_t3 v1 = normalize(view.getVertex(corner | (1u << axis))); + + bool neg0 = v0.z < 0.0f; + bool neg1 = v1.z < 0.0f; + + // fully behind camera + if (neg0 && neg1) + continue; + + float32_t3 p0 = v0; + float32_t3 p1 = v1; + + // clip if one vertex is behind camera + if (neg0 ^ neg1) + { + float32_t t = v0.z / (v0.z - v1.z); + float32_t3 clip = normalize(lerp(v0, v1, t)); + + p0 = neg0 ? clip : v0; + p1 = neg1 ? 
clip : v1; + } + + float32_t3 pts[2] = {p0, p1}; + float32_t c = drawGreatCircleArc(pts, 0.003f); + color += float32_t4(hiddenEdgeColor * c, c); + } + } + + return color; + } + + // Best caliper edge highlighted in gold + static float32_t4 visualizeBestCaliperEdge(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, uint32_t bestEdgeIdx) + { + float32_t4 result = float32_t4(0, 0, 0, 0); + + if (bestEdgeIdx >= count) + return result; + + float32_t3 v0 = vertices[bestEdgeIdx]; + float32_t3 v1 = vertices[(bestEdgeIdx + 1) % count]; + + float32_t3 pts[2] = {v0, v1}; + float32_t3 highlightColor = float32_t3(1.0f, 0.8f, 0.0f); + float32_t alpha = drawGreatCircleArc(pts, 0.008f); + result += float32_t4(highlightColor * alpha, alpha); + + return result; + } + + // ======================================================================== + // Sample visualization (sphere dot + parameter-space square overlay) + // ======================================================================== + + static float32_t4 visualizeSample(float32_t3 sampleDir, float32_t2 xi, uint32_t colorIndex, float32_t2 screenUV) + { + float32_t4 accumColor = 0; + float32_t3 sampleColor = colorLUT[colorIndex].rgb; + + // 3D dot on the sphere + float32_t dist3D = distance(sampleDir, normalize(VisContext::spherePos())); + float32_t alpha3D = 1.0f - smoothstep(0.0f, 0.02f, dist3D); + if (alpha3D > 0.0f) + accumColor += float32_t4(sampleColor * alpha3D, alpha3D); + + // Parameter-space square (PSS) overlay + static const float32_t2 pssSize = float32_t2(0.2, 0.2); + static const float32_t2 pssPos = float32_t2(0.01, 0.01); + bool isInsidePSS = all(and(screenUV >= pssPos, screenUV <= (pssPos + pssSize))); + + if (isInsidePSS) + { + // Cross marker at the sample's xi position + float32_t2 xiPixelPos = pssPos + xi * pssSize; + float32_t alpha2D = drawCross2D(screenUV, xiPixelPos, 0.005f, 0.001f); + if (alpha2D > 0.0f) + accumColor += float32_t4(sampleColor * alpha2D, alpha2D); + + // Faint border outline 
+ float32_t2 edgeDist = min(screenUV - pssPos, (pssPos + pssSize) - screenUV); + float32_t borderDist = min(edgeDist.x, edgeDist.y); + float32_t borderAlpha = 1.0f - smoothstep(0.001f, 0.003f, borderDist); + if (borderAlpha > 0.0f) + accumColor += float32_t4(0.3f, 0.3f, 0.3f, 1.0f) * borderAlpha; + } + + return accumColor; + } + + // ======================================================================== + // 3D ray arrow visualization + // ======================================================================== + + // Project 3D point to NDC space + static float32_t2 projectToNDC(float32_t3 worldPos, float32_t4x4 viewProj, float32_t aspect) + { + float32_t4 clipPos = mul(viewProj, float32_t4(worldPos, 1.0)); + clipPos /= clipPos.w; + clipPos.x *= aspect; + return clipPos.xy; + } + + struct ArrowResult + { + float32_t4 color; + float32_t depth; + }; + + // Visualize a ray as an arrow from origin in NDC space. + // Returns color (rgb), intensity (a), and depth. + static ArrowResult visualizeRayAsArrow(float32_t3 rayOrigin, float32_t4 directionAndPdf, float32_t arrowLength, + float32_t2 ndcPos, float32_t aspect, float32_t4x4 viewProjMatrix) + { + ArrowResult result; + result.color = float32_t4(0, 0, 0, 0); + result.depth = 0.0; // Far plane in reversed-Z + + float32_t3 rayDir = normalize(directionAndPdf.xyz); + float32_t pdf = directionAndPdf.w; + + // Define the 3D line segment + float32_t3 worldStart = rayOrigin; + float32_t3 worldEnd = rayOrigin + rayDir * arrowLength; + + float32_t4 clipStart = mul(viewProjMatrix, float32_t4(worldStart, 1.0)); + float32_t4 clipEnd = mul(viewProjMatrix, float32_t4(worldEnd, 1.0)); + + // Clip against near plane (w = 0 plane in clip space) + // If both points are behind camera, reject + if (clipStart.w <= 0.001 && clipEnd.w <= 0.001) + return result; + + // If line crosses the near plane, clip it + float32_t t0 = 0.0; + float32_t t1 = 1.0; + + if (clipStart.w <= 0.001) + { + float32_t t = (0.001 - clipStart.w) / (clipEnd.w - 
clipStart.w); + t0 = saturate(t); + clipStart = lerp(clipStart, clipEnd, t0); + worldStart = lerp(worldStart, worldEnd, t0); + } + + if (clipEnd.w <= 0.001) + { + float32_t t = (0.001 - clipStart.w) / (clipEnd.w - clipStart.w); + t1 = saturate(t); + clipEnd = lerp(clipStart, clipEnd, t1); + worldEnd = lerp(worldStart, worldEnd, t1); + } + + // Now check if the clipped segment is valid + if (t0 >= t1) + return result; + + // Perspective divide to NDC + float32_t2 ndcStart = clipStart.xy / clipStart.w; + float32_t2 ndcEnd = clipEnd.xy / clipEnd.w; + + // Apply aspect ratio correction + ndcStart.x *= aspect; + ndcEnd.x *= aspect; + + // Calculate arrow direction in NDC + float32_t2 arrowVec = ndcEnd - ndcStart; + float32_t arrowNDCLength = length(arrowVec); + + // Skip if arrow is too small on screen + if (arrowNDCLength < 0.005) + return result; + + // Calculate perpendicular distance to line segment in NDC space + float32_t2 toPixel = ndcPos - ndcStart; + float32_t t_ndc = saturate(dot(toPixel, arrowVec) / dot(arrowVec, arrowVec)); + + // Draw line shaft + float32_t lineThickness = 0.002; + float32_t lineIntensity = lineSegment(ndcPos, ndcStart, ndcEnd, lineThickness); + + // Calculate perspective-correct depth + if (lineIntensity > 0.0) + { + float32_t4 clipPos = lerp(clipStart, clipEnd, t_ndc); + float32_t depthNDC = clipPos.z / clipPos.w; + result.depth = 1.0f - depthNDC; + + if (result.depth < 0.0 || result.depth > 1.0) + lineIntensity = 0.0; + } + + // Modulate by PDF + float32_t pdfIntensity = saturate(pdf * 0.5); + float32_t3 finalColor = float32_t3(pdfIntensity, pdfIntensity, pdfIntensity); + + result.color = float32_t4(finalColor, lineIntensity); + return result; + } +}; + +#endif // _SOLID_ANGLE_VIS_EXAMPLE_DRAWING_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl new file mode 100644 index 000000000..edaaa929d --- /dev/null +++ 
b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl
@@ -0,0 +1,201 @@
//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O.
//// This file is part of the "Nabla Engine".
//// For conditions of distribution and use, see copyright notice in nabla.h
#pragma shader_stage(compute)

#include "app_resources/hlsl/common.hlsl"
#include "app_resources/hlsl/benchmark/common.hlsl"
#include "app_resources/hlsl/silhouette.hlsl"
#include "app_resources/hlsl/parallelogram_sampling.hlsl"
#include "app_resources/hlsl/pyramid_sampling.hlsl"
#include "app_resources/hlsl/triangle_sampling.hlsl"
#include "app_resources/hlsl/obb_face_sampling.hlsl"

using namespace nbl::hlsl;

[[vk::binding(0, 0)]] RWByteAddressBuffer outputBuffer;
[[vk::push_constant]] BenchmarkPushConstants pc;

static const SAMPLING_MODE_FLAGS benchmarkMode = SAMPLING_MODE_FLAGS_CONST;

// Deterministic 8x8 stratified sample for the benchmark loops.
// The low three bits of sampleIdx pick the column, the next three bits pick the
// row, and the sample sits at the cell centre. Indices >= 64 wrap around the
// grid, so both coordinates stay inside the unit square no matter how many
// samples a thread draws (previously the second coordinate grew without bound).
// threadIdx adds a tiny per-thread offset so invocations do not share inputs.
float32_t2 stratifiedXi(uint32_t sampleIdx, uint32_t threadIdx)
{
    const float32_t threadJitter = float32_t(threadIdx) * 1e-9f;
    const uint32_t cellX = sampleIdx & 7u;
    const uint32_t cellY = (sampleIdx >> 3u) & 7u;
    return float32_t2(
        (float32_t(cellX) + 0.5f) / 8.0f + threadJitter,
        (float32_t(cellY) + 0.5f) / 8.0f + threadJitter);
}

// Per-thread input perturbation: scatters threads across the 27 OBB regions and
// generates a fresh OBBView per outer-loop iteration so creation work can't be
// hoisted out by the compiler. Returns just the view; callers build their own
// ClippedSilhouette + materialized verts from it as needed.
// baseOffset is a per-thread offset; rng supplies three further draws that are
// mapped to [-4, 4) and added to the translation column of pc.modelMatrix.
// rcpU32 is the factor that maps a 32-bit rng output to [0, 1].
shapes::OBBView makePerturbedView(float32_t3 baseOffset, NBL_REF_ARG(Xoroshiro64Star) rng, float32_t rcpU32)
{
    const float32_t3 cJ = float32_t3(
        (float32_t(rng()) * rcpU32 - 0.5f) * 8.0f,
        (float32_t(rng()) * rcpU32 - 0.5f) * 8.0f,
        (float32_t(rng()) * rcpU32 - 0.5f) * 8.0f);
    float32_t3x4 cM = pc.modelMatrix;
    cM[0][3] += baseOffset.x + cJ.x;
    cM[1][3] += baseOffset.y + cJ.y;
    cM[2][3] += baseOffset.z + cJ.z;
    return shapes::OBBView::create(cM);
}

// Shared create-and-sample loop for any sampler with the standard
// `create(silhouette, view)` + `generate/forwardPdf/selectedIdx(cache)` shape.
// XORs all outputs into the returned sink to defeat DCE.
//
// The template parameter list was missing from this declaration (text between
// angle brackets had been stripped); it is restored from the uses of SamplerT below.
template<typename SamplerT>
uint32_t runCreateAndSample(uint32_t creations, NBL_REF_ARG(Xoroshiro64Star) rng, float32_t rcpU32, uint32_t invocationID, float32_t3 rndOffset)
{
    uint32_t sink = 0;
    for (uint32_t c = 0; c < creations; c++)
    {
        // One fresh view + silhouette + sampler per outer iteration.
        shapes::OBBView view = makePerturbedView(rndOffset, rng, rcpU32);
        ClippedSilhouette silhouette = ClippedSilhouette::create(view, pc.shadingPoint);
        SamplerT sampler = SamplerT::create(silhouette, view);

        for (uint32_t s = 0; s < pc.samplesPerCreation; s++)
        {
            float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID);
            typename SamplerT::cache_type cache;
            float32_t3 dir = sampler.generate(xi, cache);
            float32_t pdf = sampler.forwardPdf(xi, cache);
            sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ sampler.selectedIdx(cache);
        }
    }
    return sink;
}

// Variant for samplers whose `create(view)` works directly from the OBBView
// without needing a ClippedSilhouette upstream. Skips the ~25-30 ps silhouette
// build cost per creation.
// The template parameter list was missing from this declaration (stripped angle
// brackets); it is restored from the uses of SamplerT below. The sampler is
// built with `SamplerT::create(view, pc.shadingPoint)`.
template<typename SamplerT>
uint32_t runCreateAndSampleNoSilhouette(uint32_t creations, NBL_REF_ARG(Xoroshiro64Star) rng, float32_t rcpU32, uint32_t invocationID, float32_t3 rndOffset)
{
    uint32_t sink = 0;
    for (uint32_t c = 0; c < creations; c++)
    {
        shapes::OBBView view = makePerturbedView(rndOffset, rng, rcpU32);
        SamplerT sampler = SamplerT::create(view, pc.shadingPoint);

        for (uint32_t s = 0; s < pc.samplesPerCreation; s++)
        {
            float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID);
            typename SamplerT::cache_type cache;
            float32_t3 dir = sampler.generate(xi, cache);
            float32_t pdf = sampler.forwardPdf(xi, cache);
            sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ sampler.selectedIdx(cache);
        }
    }
    return sink;
}

// Pyramid-create-only benchmark using synthetic random vertices. Templated on
// UseCaliper so PYRAMID_CREATION_ONLY and CALIPER_PYRAMID_CREATION_ONLY share
// one body. Inner sampler is unused (no generate() calls), so default to SphRect.
//
// The template parameter list is restored from the comment above and from the
// boolean argument passed at the call site in main().
template<bool UseCaliper>
uint32_t runPyramidCreationOnly(NBL_REF_ARG(Xoroshiro64Star) rng, float32_t rcpU32)
{
    // NOTE(review): the template arguments of SphericalPyramid are missing from
    // this line (text between angle brackets was lost); restore from upstream.
    typedef SphericalPyramid > PyramidT;
    uint32_t sink = 0;
    for (uint32_t i = 0; i < pc.sampleCount; i++)
    {
        float32_t3 synthVerts[MAX_SILHOUETTE_VERTICES];
        NBL_UNROLL
        for (uint32_t init = 0; init < MAX_SILHOUETTE_VERTICES; init++)
            synthVerts[init] = float32_t3(0, 0, 0);
        const uint32_t synthCount = 5;

        for (uint32_t v = 0; v < synthCount; v++)
        {
            float32_t x = (float32_t(rng()) * rcpU32 - 0.5f) * 1.2f;
            float32_t y = (float32_t(rng()) * rcpU32 - 0.5f) * 1.2f;
            // Diagnostic raw-rng sink: forces rng+normalize cost into the timing
            // even if the entire pyramid create() gets DCE'd downstream.
            sink ^= asuint(x) ^ asuint(y);
            synthVerts[v] = normalize(float32_t3(x, y, 1.0f));
            sink ^= asuint(synthVerts[v].x) ^ asuint(synthVerts[v].y) ^ asuint(synthVerts[v].z);
        }

        float32_t2 dummyR0, dummyExt;
        PyramidT pyramid = PyramidT::createFromVertices(synthVerts, synthCount, dummyR0, dummyExt);

        const float32_t3 axis3 = pyramid.getAxis3();
        sink ^= asuint(pyramid.axis1.x) ^ asuint(pyramid.axis1.y) ^ asuint(pyramid.axis1.z);
        sink ^= asuint(pyramid.axis2.x) ^ asuint(pyramid.axis2.y) ^ asuint(pyramid.axis2.z);
        sink ^= asuint(axis3.x) ^ asuint(axis3.y) ^ asuint(axis3.z);
        // One edge normal per synthetic vertex (synthCount == 5).
        NBL_UNROLL
        for (uint32_t e = 0; e < 5; e++)
        {
            const float32_t3 n = pyramid.silEdgeNormals.edgeNormals[e];
            sink ^= asuint(n.x) ^ asuint(n.y) ^ asuint(n.z);
        }
    }
    return sink;
}

// Benchmark entry point: one invocation runs the workload selected at compile
// time by benchmarkMode and stores its XOR sink at outputBuffer[invocationID].
[numthreads(BENCHMARK_WORKGROUP_DIMENSION_SIZE_X, 1, 1)]
void main()
{
    const uint32_t invocationID = nbl::hlsl::glsl::gl_GlobalInvocationID().x;

    Xoroshiro64Star rng = Xoroshiro64Star::construct(uint32_t2(invocationID.x + 0x9e3779b9u, invocationID.x * 0x85ebca77u + 1u));
    const float32_t rcpU32 = 1.0f / 4294967296.0f;
    const float32_t3 rndOffset = float32_t3(
        (float32_t(rng()) * rcpU32 - 0.5f) * 8.0f,
        (float32_t(rng()) * rcpU32 - 0.5f) * 8.0f,
        (float32_t(rng()) * rcpU32 - 0.5f) * 8.0f);

    // XOR sink: every output XORs into this to prevent DCE.
    uint32_t sink = 0;

    // Sampling modes use a nested loop: outer iterates over `creations`, inner over
    // `samplesPerCreation`. Total samples per thread = sampleCount.
    // The divisor is clamped so a zero push constant cannot cause an integer
    // division by zero, whose result is undefined and could become a huge loop count.
    const uint32_t creations = pc.sampleCount / max(pc.samplesPerCreation, 1u);

    // NOTE(review): the explicit template arguments of the runCreateAndSample /
    // runCreateAndSampleNoSilhouette calls below are missing (text between angle
    // brackets was lost, leaving stray `>`); restore them from upstream.
    if (benchmarkMode == SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY)
    {
        // Measure full silhouette-prep cost = create + materialize. The previous
        // ClippedSilhouette did both inline; the metadata-only ClippedSilhouette
        // splits them, so we exercise both here to keep this benchmark
        // apples-to-apples.
        for (uint32_t i = 0; i < pc.sampleCount; i++)
        {
            shapes::OBBView iterView = makePerturbedView(rndOffset, rng, rcpU32);
            ClippedSilhouette iterSilhouette = ClippedSilhouette::create(iterView, pc.shadingPoint);
            float32_t3 iterVerts[MAX_SILHOUETTE_VERTICES];
            iterSilhouette.materialize(iterView, iterVerts);

            sink ^= iterSilhouette.count;
            NBL_UNROLL
            for (uint32_t j = 0; j < MAX_SILHOUETTE_VERTICES; j++)
                sink ^= asuint(iterVerts[j].x) ^ asuint(iterVerts[j].y) ^ asuint(iterVerts[j].z);
        }
    }
    else if ((benchmarkMode & SAMPLING_MODE_FLAGS::FLAG_PYRAMID) != 0u && (benchmarkMode & SAMPLING_MODE_FLAGS::FLAG_CREATE_ONLY) != 0u)
        sink ^= runPyramidCreationOnly<(benchmarkMode & SAMPLING_MODE_FLAGS::FLAG_CALIPER) != 0u>(rng, rcpU32);
    // Caliper variant: tighter rect → different rejection rate, only interesting when samplesPerCreation > 1.
    else if (benchmarkMode == SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID)
        sink ^= runCreateAndSample > >(creations, rng, rcpU32, invocationID, rndOffset);
    else if (benchmarkMode == SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID)
        sink ^= runCreateAndSample > >(creations, rng, rcpU32, invocationID, rndOffset);
    else if (benchmarkMode == SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID)
        sink ^= runCreateAndSample > >(creations, rng, rcpU32, invocationID, rndOffset);
    else if ((benchmarkMode & SAMPLING_MODE_FLAGS::FLAG_TRIANGLE) != 0u)
        sink ^= runCreateAndSample >(creations, rng, rcpU32, invocationID, rndOffset);
    else if (benchmarkMode == SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE)
        sink ^= runCreateAndSample(creations, rng, rcpU32, invocationID, rndOffset);
    else if (benchmarkMode == SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID)
        sink ^= runCreateAndSample >(creations, rng, rcpU32, invocationID, rndOffset);
    else if (benchmarkMode == SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT)
        sink ^= runCreateAndSampleNoSilhouette(creations, rng, rcpU32, invocationID, rndOffset);
    else
    {
        assert(false);
    }

    const uint32_t offset = sizeof(uint32_t) * invocationID.x;
    outputBuffer.Store(offset, sink);
}
diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/common.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/common.hlsl
new file mode 100644
index 000000000..c3fa6db7c
--- /dev/null
+++ b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/common.hlsl
@@ -0,0 +1,10 @@
//// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O.
//// This file is part of the "Nabla Engine".
//// For conditions of distribution and use, see copyright notice in nabla.h

// NOTE(review): the header name of this include is missing (stripped angle brackets).
#include

NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_DIMENSION_SIZE_X = 64u;
NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y = 1u;
NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z = 1u;
NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_COUNT = 4096u;
diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl
new file mode 100644
index 000000000..bb260abfe
--- /dev/null
+++ b/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl
@@ -0,0 +1,208 @@
//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O.
//// This file is part of the "Nabla Engine".
//// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _SOLID_ANGLE_VIS_EXAMPLE_COMMON_HLSL_INCLUDED_
#define _SOLID_ANGLE_VIS_EXAMPLE_COMMON_HLSL_INCLUDED_

#include "nbl/builtin/hlsl/cpp_compat.hlsl"

#define MAX_SILHOUETTE_VERTICES 7

namespace nbl
{
namespace hlsl
{

// Sampling mode enum -- bit-encoded: low byte is the dense ID (0..Count-1),
// high bits are family/variant flags so callers can do `mode & FLAG_X` instead
// of long `||` chains. Host C++ that needs a dense index wraps mode access
// with `(uint32_t(mode) & DENSE_ID_MASK)`.
enum SAMPLING_MODE_FLAGS : uint32_t
{
    // Bits 0..7 carry the dense ID; host code masks with this for array indexing.
    DENSE_ID_MASK = 0xFF,

    // Flag bits, listed in bit order. "Family" flags name the underlying
    // geometry/sampler family, "variant" flags modify a family.
    FLAG_PYRAMID       = 1u << 8,  // family
    FLAG_TRIANGLE      = 1u << 9,  // family
    FLAG_PARALLELOGRAM = 1u << 10, // family
    FLAG_SILHOUETTE    = 1u << 11, // family
    FLAG_CALIPER       = 1u << 12, // variant
    FLAG_PROJECTED     = 1u << 13, // variant
    FLAG_BILINEAR      = 1u << 14, // variant
    FLAG_CREATE_ONLY   = 1u << 15, // variant
    FLAG_OBB_FACE      = 1u << 16, // family
    FLAG_OBB_AXES      = 1u << 17, // family

    // Modes: flags | dense ID.
    SPH_RECT_FROM_CALIPER_PYRAMID = FLAG_PYRAMID | FLAG_CALIPER | 0,
    SPH_RECT_FROM_PYRAMID = FLAG_PYRAMID | 1,
    PROJ_SPH_RECT_FROM_PYRAMID = FLAG_PYRAMID | FLAG_PROJECTED | 2,

    TRIANGLE_SOLID_ANGLE = FLAG_TRIANGLE | 3,
    TRIANGLE_PROJECTED_SOLID_ANGLE = FLAG_TRIANGLE | FLAG_PROJECTED | 4,

    PROJECTED_PARALLELOGRAM_SOLID_ANGLE = FLAG_PARALLELOGRAM | 5,

    BILINEAR_FROM_PYRAMID = FLAG_PYRAMID | FLAG_BILINEAR | 6,

    OBB_FACE_DIRECT = FLAG_OBB_FACE | 7,

    SILHOUETTE_CREATION_ONLY = FLAG_SILHOUETTE | FLAG_CREATE_ONLY | 8,
    PYRAMID_CREATION_ONLY = FLAG_PYRAMID | FLAG_CREATE_ONLY | 9,
    CALIPER_PYRAMID_CREATION_ONLY = FLAG_PYRAMID | FLAG_CALIPER | FLAG_CREATE_ONLY | 10,

    Count = 11,                        // number of distinct dense IDs
    CountWithoutCreateOnly = Count - 3 // modes that produce samples, i.e. excluding the three "creation only" ones
};

#ifndef __HLSL_VERSION
// Host helpers: dense IDs for array indexing + a parallel array for combo/iteration.
// Returns the dense ID stored in the low byte of a mode (flag bits masked off).
inline uint32_t denseIdOf(SAMPLING_MODE_FLAGS m) { return uint32_t(m) & uint32_t(SAMPLING_MODE_FLAGS::DENSE_ID_MASK); }

// Every mode, listed so that the array index equals the mode's dense ID.
// Keep this in sync with the enum by hand; nothing checks it.
constexpr SAMPLING_MODE_FLAGS kAllModes[SAMPLING_MODE_FLAGS::Count] = {
    SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID,       // dense 0
    SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID,               // dense 1
    SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID,          // dense 2
    SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE,                // dense 3
    SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE,      // dense 4
    SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE, // dense 5
    SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID,               // dense 6
    SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT,                     // dense 7
    SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY,            // dense 8
    SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY,               // dense 9
    SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY,       // dense 10
};
#endif

// Debug record filled by the DebugRecorder helpers in debug_vis.hlsl, which
// write element 0 of DebugDataBuffer. Member order defines the buffer layout,
// so do not reorder fields.
struct ResultData
{
    struct SilhouetteData
    {
        uint32_t3 region;
        uint32_t silhouetteIndex;
        uint32_t silhouetteVertexCount;
        uint32_t silhouette;
        uint32_t vertices[6];

        // Clipping
        uint32_t clipMask;
        uint32_t clipCount;
        uint32_t rotatedClipMask;
        uint32_t rotateAmount;
        uint32_t positiveVertCount;
        uint32_t wrapAround; // bool stored as uint32_t
        uint32_t rotatedSil;
        uint32_t edgeVisibilityMismatch;

        // Clipped output: positions written via DebugRecorder::recordClippedVertex
        // by callers that materialize silhouette vertices; indices recorded in parallel.
        float32_t3 clippedVertices[MAX_SILHOUETTE_VERTICES];
        uint32_t clippedVertexCount;
        uint32_t clippedVertexIndices[MAX_SILHOUETTE_VERTICES];
    } silhouette;

    struct TriangleFanData
    {
        uint32_t maxTrianglesExceeded;  // set when the recorded count is > 5
        uint32_t sphericalLuneDetected; // bool stored as uint32_t
        uint32_t triangleCount;
        float32_t solidAngles[5];       // one entry per fan triangle, capacity 5
        float32_t totalSolidAngles;
    } triangleFan;

    struct ParallelogramData
    {
        float32_t2 corners[4];
        uint32_t edgeIsConvex[4]; // one 0/1 flag per edge, unpacked from a bit mask
        uint32_t n3Mask;
        uint32_t doesNotBound;
        uint32_t failedVertexIndex;
        uint32_t verticesInside;
        uint32_t edgesInside;
        float32_t area;
    } parallelogram;

    // NOTE(review): DebugRecorder::recordPyramid stores (atan(max) - atan(min)) / 2
    // into halfWidth1/2, which is an angle rather than a "sin-space" width, and it
    // never writes offset1, offset2 or axis2BiggerThanAxis1 -- confirm the intended
    // meaning of these fields.
    struct PyramidData
    {
        float32_t3 axis1;     // First caliper axis direction
        float32_t3 axis2;     // Second caliper axis direction
        float32_t3 center;    // Silhouette center direction
        float32_t halfWidth1; // Half-width along axis1 (sin-space)
        float32_t halfWidth2; // Half-width along axis2 (sin-space)
        float32_t offset1;    // Center offset along axis1
        float32_t offset2;    // Center offset along axis2
        float32_t solidAngle; // Bounding region solid angle
        uint32_t bestEdge;    // Which edge produced best caliper
        float32_t min1;       // Min dot product along axis1
        float32_t max1;       // Max dot product along axis1
        float32_t min2;       // Min dot product along axis2
        float32_t max2;       // Max dot product along axis2
        uint32_t axis2BiggerThanAxis1;
    } pyramid;

    struct SamplingData
    {
        uint32_t sampleCount;
        uint32_t validSampleCount; // accumulated with InterlockedAdd by recordFrameEnd
        uint32_t threadCount;      // Per-fragment counter, used as divisor for validSampleCount
        float32_t4 rayData[512];   // xyz = direction, w = PDF
    } sampling;
};

// Push constants carrying the OBB model matrix, viewport, shading point,
// sample count and frame index.
struct PushConstants
{
    float32_t3x4 modelMatrix;
    float32_t4 viewport;
    float32_t3 shadingPoint;
    uint32_t sampleCount;
    uint32_t frameIndex;
};

// Push constants for the ray visualization pass (judging by the name -- TODO confirm).
struct PushConstantRayVis
{
    float32_t4x4 viewProjMatrix;
    float32_t3x4 viewMatrix;
    float32_t3x4 modelMatrix;
    float32_t3x4 invModelMatrix;
    float32_t3 shadingPoint;
    float32_t4 viewport;
    uint32_t frameIndex;
};
// Push constants read as `pc` by the benchmark compute shader.
struct BenchmarkPushConstants
{
    float32_t3x4 modelMatrix;
    float32_t3 shadingPoint;
    uint32_t sampleCount;        // total samples per thread (= creations * samplesPerCreation)
    uint32_t samplesPerCreation; // inner-loop count; outer-loop count = sampleCount / samplesPerCreation
};

// 27-entry debug palette; colorNames below gives the matching names in the same order.
// Presumably one colour per OBB region (3x3x3) -- TODO confirm against the users of this table.
static const float32_t3 colorLUT[27] = {
    float32_t3(0, 0, 0), float32_t3(0.5, 0.5, 0.5),
    float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1),
    float32_t3(1, 1, 0), float32_t3(1, 0, 1), float32_t3(0, 1, 1),
    float32_t3(1, 0.5, 0), float32_t3(1, 0.65, 0), float32_t3(0.8, 0.4, 0),
    float32_t3(1, 0.4, 0.7), float32_t3(1, 0.75, 0.8), float32_t3(0.7, 0.1, 0.3),
    float32_t3(0.5, 0, 0.5), float32_t3(0.6, 0.4, 0.8), float32_t3(0.3, 0, 0.5),
    float32_t3(0, 0.5, 0), float32_t3(0.5, 1, 0), float32_t3(0, 0.5, 0.25),
    float32_t3(0, 0, 0.5), float32_t3(0.3, 0.7, 1), float32_t3(0, 0.4, 0.6),
    float32_t3(0.6, 0.4, 0.2), float32_t3(0.8, 0.7, 0.3), float32_t3(0.4, 0.3, 0.1), float32_t3(1, 1, 1)};

#ifndef __HLSL_VERSION
// Host-only display names, index-aligned with colorLUT.
static const char* colorNames[27] = {"Black", "Gray", "Red", "Green", "Blue", "Yellow", "Magenta", "Cyan",
    "Orange", "Light Orange", "Dark Orange", "Pink", "Light Pink", "Deep Rose", "Purple", "Light Purple",
    "Indigo", "Dark Green", "Lime", "Forest Green", "Navy", "Sky Blue", "Teal", "Brown",
    "Tan/Beige", "Dark Brown", "White"};
#endif // __HLSL_VERSION

} // namespace hlsl

} // namespace nbl

// Radius of the circle used for projected-sphere coordinates, and its reciprocal.
static const nbl::hlsl::float32_t CIRCLE_RADIUS = 0.5f;
static const nbl::hlsl::float32_t INV_CIRCLE_RADIUS = 1.0f / CIRCLE_RADIUS;

#endif // _SOLID_ANGLE_VIS_EXAMPLE_COMMON_HLSL_INCLUDED_
diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/debug_vis.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/debug_vis.hlsl
new file mode 100644
index 000000000..96ad9abf3
--- /dev/null
+++ b/73_SolidAngleVisualizer/app_resources/hlsl/debug_vis.hlsl
@@ -0,0 +1,140 @@
//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O.
//// This file is part of the "Nabla Engine".
//// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _SOLID_ANGLE_VIS_EXAMPLE_DEBUG_VIS_HLSL_INCLUDED_
#define _SOLID_ANGLE_VIS_EXAMPLE_DEBUG_VIS_HLSL_INCLUDED_

#include "common.hlsl"

#ifdef __HLSL_VERSION
// NOTE(review): the element type of this buffer is missing (text between angle
// brackets was lost). The fields accessed below match ResultData from common.hlsl.
[[vk::binding(0, 0)]] RWStructuredBuffer DebugDataBuffer;
#endif

// Static helpers that copy intermediate results into DebugDataBuffer[0].
// With DEBUG_DATA disabled every helper is an empty stub, so call sites need
// no #if guards of their own.
struct DebugRecorder
{
#if DEBUG_DATA
    // Stores one clipped silhouette vertex and the index it originated from.
    // `slot` is not range-checked; it must be < MAX_SILHOUETTE_VERTICES.
    static void recordClippedVertex(uint32_t slot, float32_t3 pos, uint32_t originalIndex)
    {
        DebugDataBuffer[0].silhouette.clippedVertices[slot] = pos;
        DebugDataBuffer[0].silhouette.clippedVertexIndices[slot] = originalIndex;
    }

    // Stores the bookkeeping values produced by silhouette clipping.
    static void recordClipResult(uint32_t vertexCount, uint32_t clipMask, uint32_t clipCount, uint32_t rotatedClipMask, uint32_t rotateAmount, uint32_t positiveCount, bool wrapAround, uint32_t rotatedSil)
    {
        DebugDataBuffer[0].silhouette.clippedVertexCount = vertexCount;
        DebugDataBuffer[0].silhouette.clipMask = clipMask;
        DebugDataBuffer[0].silhouette.clipCount = clipCount;
        DebugDataBuffer[0].silhouette.rotatedClipMask = rotatedClipMask;
        DebugDataBuffer[0].silhouette.rotateAmount = rotateAmount;
        DebugDataBuffer[0].silhouette.positiveVertCount = positiveCount;
        DebugDataBuffer[0].silhouette.wrapAround = (uint32_t)wrapAround;
        DebugDataBuffer[0].silhouette.rotatedSil = rotatedSil;
    }

    // Stores the triangle-fan decomposition. `count` is recorded as given, but at
    // most 5 per-triangle solid angles are copied, because both the source
    // parameter and the destination array hold exactly 5 entries. Counts above 5
    // are an expected input (they raise maxTrianglesExceeded) and previously
    // made the copy loop read and write past the end of both arrays.
    static void recordTriangleFan(bool luneDetected, uint32_t count, float32_t totalWeight, float32_t solidAngles[5])
    {
        DebugDataBuffer[0].triangleFan.sphericalLuneDetected = (uint32_t)luneDetected;
        DebugDataBuffer[0].triangleFan.maxTrianglesExceeded = (count > 5);
        DebugDataBuffer[0].triangleFan.triangleCount = count;
        DebugDataBuffer[0].triangleFan.totalSolidAngles = totalWeight;
        const uint32_t recordedCount = min(count, 5u);
        for (uint32_t tri = 0; tri < recordedCount; tri++)
            DebugDataBuffer[0].triangleFan.solidAngles[tri] = solidAngles[tri];
    }

    // Stores the bounding parallelogram: its area, the per-edge flags and the four
    // corners derived from `corner`, `axisDir` and the two side lengths.
    static void recordParallelogram(float32_t area, uint32_t convexMask, uint32_t n3Mask, float32_t2 corner, float32_t2 axisDir, float32_t width, float32_t height)
    {
        DebugDataBuffer[0].parallelogram.area = area;

        // Store per-edge convex and N3 flags
        DebugDataBuffer[0].parallelogram.n3Mask = n3Mask;
        for (uint32_t i = 0; i < 4; i++)
            DebugDataBuffer[0].parallelogram.edgeIsConvex[i] = (convexMask >> i) & 1u;

        // Compute and store the 4 parallelogram corners in circle-space
        float32_t2 perpDir = float32_t2(-axisDir.y, axisDir.x);
        DebugDataBuffer[0].parallelogram.corners[0] = corner;
        DebugDataBuffer[0].parallelogram.corners[1] = corner + width * axisDir;
        DebugDataBuffer[0].parallelogram.corners[2] = corner + width * axisDir + height * perpDir;
        DebugDataBuffer[0].parallelogram.corners[3] = corner + height * perpDir;
    }

    // Stores the bounding pyramid. `bounds` is read as (min1, min2, max1, max2);
    // the half widths are stored as half the difference of the arctangents of the
    // bounds. offset1, offset2 and axis2BiggerThanAxis1 are not written here.
    static void recordPyramid(float32_t3 axis1, float32_t3 axis2, float32_t3 center, float32_t4 bounds, float32_t solidAngle, uint32_t bestEdge)
    {
        DebugDataBuffer[0].pyramid.axis1 = axis1;
        DebugDataBuffer[0].pyramid.axis2 = axis2;
        DebugDataBuffer[0].pyramid.center = normalize(center);
        DebugDataBuffer[0].pyramid.halfWidth1 = (atan(bounds.z) - atan(bounds.x)) * 0.5f;
        DebugDataBuffer[0].pyramid.halfWidth2 = (atan(bounds.w) - atan(bounds.y)) * 0.5f;
        DebugDataBuffer[0].pyramid.solidAngle = solidAngle;
        DebugDataBuffer[0].pyramid.bestEdge = bestEdge;
        DebugDataBuffer[0].pyramid.min1 = bounds.x;
        DebugDataBuffer[0].pyramid.max1 = bounds.z;
        DebugDataBuffer[0].pyramid.min2 = bounds.y;
        DebugDataBuffer[0].pyramid.max2 = bounds.w;
    }

    // Stores one sampled ray (direction + pdf). `i` is not range-checked against
    // the 512-entry rayData array.
    static void recordRay(uint32_t i, float32_t3 dir, float32_t pdf) { DebugDataBuffer[0].sampling.rayData[i] = float32_t4(dir, pdf); }

    // Stores the per-frame silhouette summary and accumulates the sample counters
    // atomically across invocations.
    static void recordFrameEnd(uint32_t3 region, uint32_t configIndex, uint32_t silSize, uint32_t silData, uint32_t vertexIndices[6], uint32_t validSampleCount, uint32_t sampleCount)
    {
        DebugDataBuffer[0].silhouette.region = region;
        DebugDataBuffer[0].silhouette.silhouetteIndex = configIndex;
        DebugDataBuffer[0].silhouette.silhouetteVertexCount = silSize;
        for (uint32_t i = 0; i < 6; i++)
            DebugDataBuffer[0].silhouette.vertices[i] = vertexIndices[i];
        DebugDataBuffer[0].silhouette.silhouette = silData;

        InterlockedAdd(DebugDataBuffer[0].sampling.validSampleCount, validSampleCount);
        InterlockedAdd(DebugDataBuffer[0].sampling.threadCount, 1u);
        DebugDataBuffer[0].sampling.sampleCount = sampleCount;
    }
#else
    static void recordClippedVertex(uint32_t slot, float32_t3 pos, uint32_t originalIndex) {}
    static void recordClipResult(uint32_t vertexCount, uint32_t clipMask, uint32_t clipCount, uint32_t rotatedClipMask, uint32_t rotateAmount, uint32_t positiveCount, bool wrapAround, uint32_t rotatedSil) {}
    static void recordTriangleFan(bool luneDetected, uint32_t count, float32_t totalWeight, float32_t solidAngles[5]) {}
    static void recordParallelogram(float32_t area, uint32_t convexMask, uint32_t n3Mask, float32_t2 corner, float32_t2 axisDir, float32_t width, float32_t height) {}
    static void recordPyramid(float32_t3 axis1, float32_t3 axis2, float32_t3 center, float32_t4 bounds, float32_t solidAngle, uint32_t bestEdge) {}
    static void recordRay(uint32_t i, float32_t3 dir, float32_t pdf) {}
    static void recordFrameEnd(uint32_t3 region, uint32_t configIndex, uint32_t silSize, uint32_t silData, uint32_t vertexIndices[6], uint32_t validSampleCount, uint32_t sampleCount) {}
#endif
};

// Module-scope visualization state (per-thread in fragment shaders)
#if VISUALIZE_SAMPLES
static float32_t2 g_visNdc;
static float32_t3 g_visSpherePos;
static float32_t g_visAaWidth;
static float32_t4 g_visColor;
#endif

// Accessors for the visualization state above. With VISUALIZE_SAMPLES disabled
// every accessor is a stub returning zero and enabled() reports false.
struct VisContext
{
#if VISUALIZE_SAMPLES
    // Sets the pixel's inputs and resets the accumulated colour.
    static void begin(float32_t2 ndc, float32_t3 spherePos, float32_t _aaWidth)
    {
        g_visNdc = ndc;
        g_visSpherePos = spherePos;
        g_visAaWidth = _aaWidth;
        g_visColor = float32_t4(0, 0, 0, 0);
    }

    static void add(float32_t4 c) { g_visColor += c; }
    static float32_t4 flush() { return g_visColor; }

    static float32_t2 ndc() { return g_visNdc; }
    static float32_t3 spherePos() { return g_visSpherePos; }
    static float32_t aaWidth() { return g_visAaWidth; }
    static bool enabled() { return true; }
#else
    static void begin(nbl::hlsl::float32_t2 ndc, nbl::hlsl::float32_t3 spherePos, nbl::hlsl::float32_t aaWidth) {}
    static void add(nbl::hlsl::float32_t4 c) {}
    static nbl::hlsl::float32_t4 flush() { return nbl::hlsl::float32_t4(0, 0, 0, 0); }

    static nbl::hlsl::float32_t2 ndc() { return nbl::hlsl::float32_t2(0, 0); }
    static nbl::hlsl::float32_t3 spherePos() { return nbl::hlsl::float32_t3(0, 0, 0); }
    static nbl::hlsl::float32_t aaWidth() { return 0; }
    static bool enabled() { return false; }
#endif
};

#endif // _SOLID_ANGLE_VIS_EXAMPLE_DEBUG_VIS_HLSL_INCLUDED_
diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/obb_face_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/obb_face_sampling.hlsl
new file mode 100644
index 000000000..8e40ee522
--- /dev/null
+++ b/73_SolidAngleVisualizer/app_resources/hlsl/obb_face_sampling.hlsl
@@ -0,0 +1,181 @@
//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O.
//// This file is part of the "Nabla Engine".
//// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _SOLID_ANGLE_VIS_EXAMPLE_OBB_FACE_SAMPLING_HLSL_INCLUDED_
#define _SOLID_ANGLE_VIS_EXAMPLE_OBB_FACE_SAMPLING_HLSL_INCLUDED_

#include "common.hlsl"
#include "silhouette.hlsl" // for the (silhouette, view) overload's signature

// NOTE(review): the header names of the five includes below are missing
// (text between angle brackets was lost); restore them from upstream.
#include
#include
#include
#include
#include

// Multi-face OBB sampler -- Matt's design with shared tip vertex T as origin
// and silhouette pipeline skipped entirely. NO horizon clipping (option A):
// samples below z=0 just get pdf=0, biased for OBBs near receiver horizon.
//
// This is the best OBB-faces variant we measured (~92 ps @ 1:1, ~22 ps @ 1:16,
// ~17 ps @ 1:128). Still slower than PYRAMID_RECTANGLE on this Ampere SM at
// every ratio. Kept around as a documented baseline for future experiments
// (e.g.
// Las Vegas resampling, different inner samplers, fp16 packing) where
// the no-clipping property might justify the per-sample overhead.
//
// See feedback memory: feedback_obb_faces_direct_loses.md
//
// NOTE(review): the template arguments of sampling::SphericalRectangle,
// shapes::SphericalRectangle and shapes::CompressedSphericalRectangle are missing
// throughout this struct (text between angle brackets was lost); restore them
// from upstream.
struct OBBFaceSampler
{
    using scalar_type = float32_t;
    using vector2_type = float32_t2;
    using vector3_type = float32_t3;
    using domain_type = vector2_type;
    using codomain_type = vector3_type;
    using density_type = scalar_type;
    using weight_type = density_type;

    // Per-sample state handed from generate() to forwardPdf()/forwardWeight().
    struct cache_type
    {
        typename sampling::SphericalRectangle::cache_type inner;
        density_type pdf;
    };

    sampling::SphericalRectangle rects[3]; // only the first max(numRects, 1) entries are initialised
    uint32_t numRects;                     // number of visible faces, 0..3 (0 when the observer is inside the box)
    float32_t cumSA0;                      // solid angle of rects[0]
    float32_t cumSA1;                      // cumulative solid angle of rects[0..1]
    float32_t totalSolidAngle;
    float32_t rcpTotalSolidAngle;

    // Build sphrect for face on `Axis`, using T as the shared world-space origin.
    // T_idx encodes which OBB cube corner T is (bits 0/1/2 = axis sides).
    // swap flips right/up for correct outward-normal direction; rule is
    // popcount(T_idx) even => swap.
    //
    // The template parameter list was missing from this declaration; it is
    // restored from the uses of Axis below and the literal arguments at the call sites.
    template<uint32_t Axis>
    static sampling::SphericalRectangle makeRectFromTip(shapes::OBBView view, float32_t3 T_pos, uint32_t T_idx, bool swap)
    {
        const uint32_t a1 = (Axis + 1u) % 3u;
        const uint32_t a2 = (Axis + 2u) % 3u;

        const float32_t s1 = ((T_idx & (1u << a1)) != 0u) ? -1.0f : 1.0f;
        const float32_t s2 = ((T_idx & (1u << a2)) != 0u) ? -1.0f : 1.0f;
        const float32_t3 rNatural = view.columns[a1] * s1;
        const float32_t3 uNatural = view.columns[a2] * s2;

        shapes::CompressedSphericalRectangle compressed;
        compressed.origin = T_pos;
        if (swap)
        {
            compressed.right = uNatural;
            compressed.up = rNatural;
        }
        else
        {
            compressed.right = rNatural;
            compressed.up = uNatural;
        }

        const shapes::SphericalRectangle shapeRect = shapes::SphericalRectangle::create(compressed);
        return sampling::SphericalRectangle::create(shapeRect, float32_t3(0.0f, 0.0f, 0.0f));
    }

    // create(view, shadingPoint) -- region derived inline from view, no silhouette pipeline.
    static OBBFaceSampler create(shapes::OBBView view, float32_t3 shadingPoint)
    {
        OBBFaceSampler self;

        // Region inline (mirrors silhouette.hlsl ClippedSilhouette::create); all
        // in shading-point-relative coords.
        const float32_t3 toMin = view.minCorner - shadingPoint;
        const float32_t3 sqScales = float32_t3(dot(view.columns[0], view.columns[0]), dot(view.columns[1], view.columns[1]), dot(view.columns[2], view.columns[2]));
        const float32_t3 proj = -float32_t3(dot(view.columns[0], toMin), dot(view.columns[1], toMin), dot(view.columns[2], toMin));
        const uint32_t3 below = uint32_t3(proj < float32_t3(0, 0, 0));
        const uint32_t3 above = uint32_t3(proj > sqScales);
        const uint32_t3 region = uint32_t3(uint32_t3(1u, 1u, 1u) + below - above);

        // A face pair is visible on an axis unless the observer lies between its two planes.
        const bool xVis = (region.x != 1u);
        const bool yVis = (region.y != 1u);
        const bool zVis = (region.z != 1u);
        self.numRects = uint32_t(xVis) + uint32_t(yVis) + uint32_t(zVis);

        // Tip T: bit i set iff observer past max on axis i (region[i] == 0).
        const uint32_t T_idx = (uint32_t(region.x == 0u) << 0)
                             | (uint32_t(region.y == 0u) << 1)
                             | (uint32_t(region.z == 0u) << 2);
        const float32_t3 T_pos = view.getVertex(T_idx) - shadingPoint;

        const bool swap = (countbits(T_idx) & 1u) == 0u;

        // Slot 0: first visible axis. Cascade keeps every rects[K] write at a
        // literal slot index, every makeRectFromTip at literal Axis.
        if (xVis)
            self.rects[0] = makeRectFromTip<0>(view, T_pos, T_idx, swap);
        else if (yVis)
            self.rects[0] = makeRectFromTip<1>(view, T_pos, T_idx, swap);
        else
            self.rects[0] = makeRectFromTip<2>(view, T_pos, T_idx, swap);

        // Slot 1: second visible. xVis && yVis -> y; otherwise z.
        if (self.numRects >= 2u)
        {
            if (xVis && yVis)
                self.rects[1] = makeRectFromTip<1>(view, T_pos, T_idx, swap);
            else
                self.rects[1] = makeRectFromTip<2>(view, T_pos, T_idx, swap);
        }

        // Slot 2: only when all 3 visible -> axis z.
        if (self.numRects == 3u)
            self.rects[2] = makeRectFromTip<2>(view, T_pos, T_idx, swap);

        // CDF over face solid angles.
        self.cumSA0 = self.rects[0].solidAngle;
        self.cumSA1 = self.cumSA0 + ((self.numRects >= 2u) ? self.rects[1].solidAngle : 0.0f);
        self.totalSolidAngle = self.cumSA1 + ((self.numRects == 3u) ? self.rects[2].solidAngle : 0.0f);
        self.rcpTotalSolidAngle = 1.0f / self.totalSolidAngle;

        return self;
    }

    // Uniform interface compatibility: ignores `silhouette`'s geometry (region
    // is derived inline from view) but reads its baked-in shadingPoint so the
    // sampler agrees with the silhouette's classification frame.
    static OBBFaceSampler create(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, shapes::OBBView view)
    {
        return create(view, silhouette.shadingPoint);
    }

    // Picks a face proportionally to its solid angle using u.x, then samples that
    // face with the re-stretched u.x and the original u.y. Directions with
    // z <= 0 are returned with pdf 0 (no horizon clipping).
    //
    // Only rects[0 .. max(numRects, 1) - 1] are initialised, so face selection
    // must never fall through to an unbuilt slot. With a single (or no) visible
    // face, `target` can still reach cumSA0 through rounding or u.x >= 1; that
    // case used to land in the rects[2] branch with a 0/0 remapped coordinate.
    // It is now routed to rects[0], and every remapped coordinate is clamped to
    // 1. For u.x in [0,1) with exact arithmetic the face choice and the
    // remapped values are unchanged.
    codomain_type generate(domain_type u, NBL_REF_ARG(cache_type) cache)
    {
        const float32_t target = u.x * totalSolidAngle;
        codomain_type dir;

        if (numRects <= 1u || target < cumSA0)
        {
            const float32_t uPrime = min(target / cumSA0, 1.0f);
            dir = rects[0].generate(float32_t2(uPrime, u.y), cache.inner);
        }
        else if (numRects == 2u || target < cumSA1)
        {
            // For numRects == 2, cumSA1 equals totalSolidAngle by construction.
            const float32_t faceSA = cumSA1 - cumSA0;
            const float32_t uPrime = min((target - cumSA0) / faceSA, 1.0f);
            dir = rects[1].generate(float32_t2(uPrime, u.y), cache.inner);
        }
        else // numRects == 3 and target >= cumSA1
        {
            const float32_t faceSA = totalSolidAngle - cumSA1;
            const float32_t uPrime = min((target - cumSA1) / faceSA, 1.0f);
            dir = rects[2].generate(float32_t2(uPrime, u.y), cache.inner);
        }

        const bool valid = dir.z > 0.0f;
        cache.pdf = hlsl::select(valid, rcpTotalSolidAngle, 0.0f);
        return dir;
    }

    // Both return the pdf cached by generate(); `u` is unused.
    density_type forwardPdf(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; }
    weight_type forwardWeight(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; }
    // Always 0: the chosen face is not stored in the cache.
    uint32_t selectedIdx(cache_type cache) NBL_CONST_MEMBER_FUNC { return 0u; }
};

#endif // _SOLID_ANGLE_VIS_EXAMPLE_OBB_FACE_SAMPLING_HLSL_INCLUDED_
diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl
new file mode 100644
index 000000000..1751f1524
--- /dev/null
+++ b/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl
@@ -0,0 +1,496 @@
//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O.
//// This file is part of the "Nabla Engine".
//// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _SOLID_ANGLE_VIS_EXAMPLE_PARALLELOGRAM_SAMPLING_HLSL_INCLUDED_
#define _SOLID_ANGLE_VIS_EXAMPLE_PARALLELOGRAM_SAMPLING_HLSL_INCLUDED_

// NOTE(review): the header names of the two includes below are missing
// (text between angle brackets was lost); restore them from upstream.
#include
#include
#include "silhouette.hlsl"
#include "drawing.hlsl"

#define MAX_CURVE_APEXES 2
#define GET_PROJ_VERT(i) vertices[i].xy *CIRCLE_RADIUS

// ============================================================================
// Minimum bounding rectangle on projected sphere
//
// All internal helpers operate on a pre-materialized + pre-normalized vertex
// array `verts[7]`.
The factory `create(silhouette)` materializes verts +// locally via the silhouette's +/- walk (using its stored view) and absorbs +// SilEdgeNormals as a member so sample(xi, pdf) needs no extra args. +// ============================================================================ +struct Parallelogram +{ + using scalar_type = float32_t; + using vector2_type = float32_t2; + using vector3_type = float32_t3; + using domain_type = vector2_type; + using codomain_type = vector3_type; + using density_type = scalar_type; + using weight_type = density_type; + + // Cache for the TractableSampler concept: stores enough state from + // generate() that forwardPdf()/forwardWeight() are O(1) lookups instead + // of redoing the inside test. selectedIdx is unused for Parallelogram + // (no subdivision) but kept for uniform extraction by visualizeSample(). + struct cache_type + { + density_type pdf; + }; + + float16_t2 corner; + float16_t2 axisDir; + float16_t width; + float16_t height; + SilEdgeNormals normals; // per-edge cross products in world frame for the inside test in sample() + + // ======================================================================== + // Projection helpers + // ======================================================================== + + static float32_t3 circleToSphere(float32_t2 circlePoint) + { + float32_t2 xy = circlePoint * INV_CIRCLE_RADIUS; + float32_t xy_len_sq = dot(xy, xy); + return float32_t3(xy, sqrt(1.0f - xy_len_sq)); + } + + // ======================================================================== + // Curve evaluation helpers + // ======================================================================== + + static float32_t2 evalCurvePoint(float32_t3 S, float32_t3 E, float32_t t) + { + float32_t3 v = S + t * (E - S); + float32_t invLen = rsqrt(dot(v, v)); + return v.xy * (invLen * CIRCLE_RADIUS); + } + + static float32_t2 evalCurveTangent(float32_t3 S, float32_t3 E, float32_t t) + { + float32_t3 v = S + t * (E - S); + float32_t vLenSq 
= dot(v, v); + + if (vLenSq < 1e-12f) + return normalize(E.xy - S.xy); + + float32_t3 p = v * rsqrt(vLenSq); + float32_t3 vPrime = E - S; + float32_t2 tangent2D = (vPrime - p * dot(p, vPrime)).xy; + + float32_t len = length(tangent2D); + return (len > 1e-7f) ? tangent2D / len : normalize(E.xy - S.xy); + } + + // Get both endpoint tangents (shares SdotE computation) + static void getProjectedTangents(float32_t3 S, float32_t3 E, out float32_t2 t0, out float32_t2 t1) + { + float32_t SdotE = dot(S, E); + + float32_t2 tangent0_2D = (E - S * SdotE).xy; + float32_t2 tangent1_2D = (E * SdotE - S).xy; + + float32_t len0Sq = dot(tangent0_2D, tangent0_2D); + float32_t len1Sq = dot(tangent1_2D, tangent1_2D); + + const float32_t eps = 1e-14f; + + if (len0Sq > eps && len1Sq > eps) + { + t0 = tangent0_2D * rsqrt(len0Sq); + t1 = tangent1_2D * rsqrt(len1Sq); + return; + } + + // Rare fallback path + float32_t2 diff = E.xy - S.xy; + float32_t diffLenSq = dot(diff, diff); + float32_t2 fallback = diffLenSq > eps ? diff * rsqrt(diffLenSq) : float32_t2(1.0f, 0.0f); + + t0 = len0Sq > eps ? tangent0_2D * rsqrt(len0Sq) : fallback; + t1 = len1Sq > eps ? 
tangent1_2D * rsqrt(len1Sq) : fallback; + } + + // Compute apex with clamping to prevent apex explosion + static void computeApexClamped(float32_t2 p0, float32_t2 p1, float32_t2 t0, float32_t2 t1, out float32_t2 apex) + { + float32_t denom = t0.x * t1.y - t0.y * t1.x; + float32_t2 center = (p0 + p1) * 0.5f; + + if (abs(denom) < 1e-6f) + { + apex = center; + return; + } + + float32_t2 dp = p1 - p0; + float32_t s = (dp.x * t1.y - dp.y * t1.x) / denom; + apex = p0 + s * t0; + + float32_t2 toApex = apex - center; + float32_t distSq = dot(toApex, toApex); + float32_t maxDistSq = CIRCLE_RADIUS * CIRCLE_RADIUS * 4.0f; + + if (distSq > maxDistSq) + { + apex = center + toApex * (CIRCLE_RADIUS * 2.0f * rsqrt(distSq)); + } + } + + // ======================================================================== + // Bounding box computation (rotating calipers) + // + // testEdgeForAxis and computeBoundsForAxis are + // templated on a bool to select between two precision levels: + // + // Accurate=false (used by tryCaliperDir, O(N^2) total calls): + // Tests vertices + edge midpoints only. Cheap (just dot products) and + // sufficient for *ranking* candidate axes, even though it may + // underestimate the true extent of convex edges. + // + // Accurate=true (used by buildForAxis, called once): + // Also computes tangent-line apex intersections for convex edges to + // find the true extremum. Great circle arcs that project as convex + // curves can bulge beyond their endpoints; the apex (tangent + // evaluation + line intersection + clamping) captures this but is + // ~4x more expensive per edge. + // + // The fast path gives the same relative ranking of axes (the + // approximation error is consistent across candidates), so the + // cheapest axis found by Fast is also the cheapest under Accurate. 
+ // ======================================================================== + + static void testPoint(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, float32_t2 pt, float32_t2 dir, float32_t2 perpDir) + { + float32_t projAlong = dot(pt, dir); + float32_t projPerp = dot(pt, perpDir); + + minAlong = min(minAlong, projAlong); + maxAlong = max(maxAlong, projAlong); + minPerp = min(minPerp, projPerp); + maxPerp = max(maxPerp, projPerp); + } + + // Accurate=false (Fast): tests vertex + midpoint only. Used O(N^2) times for axis ranking. + // Accurate=true: also computes tangent-line apex for convex edges. Used once for final rect. + template + static void testEdgeForAxis(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, uint32_t convexMask, uint32_t n3Mask, float32_t2 dir, float32_t2 perpDir) + { + const uint32_t nextIdx = (I + 1 < count) ? 
I + 1 : 0; + const float32_t2 projectedVertex = GET_PROJ_VERT(I); + + testPoint(minAlong, maxAlong, minPerp, maxPerp, projectedVertex, dir, perpDir); + + bool isN3 = (n3Mask & (1u << I)) != 0; + + if (Accurate) + { + bool isConvex = (convexMask & (1u << I)) != 0; + + if (!isN3 && !isConvex) + return; + + float32_t3 S = vertices[I]; + float32_t3 E = vertices[nextIdx]; + float32_t2 midPoint = evalCurvePoint(S, E, 0.5f); + + if (isN3) + { + testPoint(minAlong, maxAlong, minPerp, maxPerp, midPoint, dir, perpDir); + } + + if (isConvex) + { + float32_t2 t0, endTangent; + getProjectedTangents(S, E, t0, endTangent); + + if (dot(t0, perpDir) > 0.0f) + { + float32_t2 apex0; + if (isN3) + { + float32_t2 tangentAtMid = evalCurveTangent(S, E, 0.5f); + computeApexClamped(projectedVertex, midPoint, t0, tangentAtMid, apex0); + testPoint(minAlong, maxAlong, minPerp, maxPerp, apex0, dir, perpDir); + + if (dot(tangentAtMid, perpDir) > 0.0f) + { + float32_t2 apex1; + computeApexClamped(midPoint, E.xy * CIRCLE_RADIUS, tangentAtMid, endTangent, apex1); + testPoint(minAlong, maxAlong, minPerp, maxPerp, apex1, dir, perpDir); + } + } + else + { + computeApexClamped(projectedVertex, E.xy * CIRCLE_RADIUS, t0, endTangent, apex0); + testPoint(minAlong, maxAlong, minPerp, maxPerp, apex0, dir, perpDir); + } + } + } + } + else + { + if (isN3) + { + float32_t2 midPoint = evalCurvePoint(vertices[I], vertices[nextIdx], 0.5f); + testPoint(minAlong, maxAlong, minPerp, maxPerp, midPoint, dir, perpDir); + } + } + } + + // Unrolled bounding box computation for a given axis direction. + // Accurate=false: fast path for axis ranking during candidate selection. + // Accurate=true: tight bounds with apex computation for the final rectangle. 
+ template + static void computeBoundsForAxis(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, uint32_t convexMask, uint32_t n3Mask, float32_t2 dir, float32_t2 perpDir) + { + testEdgeForAxis<0, Accurate>(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); + testEdgeForAxis<1, Accurate>(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); + testEdgeForAxis<2, Accurate>(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); + if (count > 3) + { + testEdgeForAxis<3, Accurate>(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); + if (count > 4) + { + testEdgeForAxis<4, Accurate>(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); + if (count > 5) + { + testEdgeForAxis<5, Accurate>(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); + if (count > 6) + { + testEdgeForAxis<6, Accurate>(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); + } + } + } + } + } + + static void tryCaliperDir(inout float32_t bestArea, inout float32_t2 bestDir, const float32_t2 dir, float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, uint32_t n3Mask) + { + float32_t2 perpDir = float32_t2(-dir.y, dir.x); + + float32_t minAlong = 1e10f; + float32_t maxAlong = -1e10f; + float32_t minPerp = 1e10f; + float32_t maxPerp = -1e10f; + + computeBoundsForAxis(minAlong, maxAlong, minPerp, maxPerp, vertices, count, 0, n3Mask, dir, perpDir); + + float32_t area = (maxAlong - minAlong) * (maxPerp - minPerp); + if (area < bestArea) + { + bestArea = area; + bestDir = dir; + } + } + + template + static void processEdge(inout float32_t bestArea, inout float32_t2 bestDir, inout uint32_t convexMask, inout uint32_t n3Mask, float32_t3 
vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, inout SilEdgeNormals precompSil) + { + const uint32_t nextIdx = (I + 1 < count) ? I + 1 : 0; + float32_t3 S = vertices[I]; + float32_t3 E = vertices[nextIdx]; + precompSil.edgeNormals[I] = float16_t3(cross(S, E)); + + float32_t2 t0, t1; + getProjectedTangents(S, E, t0, t1); + + tryCaliperDir(bestArea, bestDir, t0, vertices, count, n3Mask); + + if (nbl::hlsl::cross2D(S.xy, E.xy) < -1e-6f) + { + convexMask |= (1u << I); + tryCaliperDir(bestArea, bestDir, t1, vertices, count, n3Mask); + + if (dot(t0, t1) < 0.5f) + { + n3Mask |= (1u << I); + float32_t2 tangentAtMid = evalCurveTangent(S, E, 0.5f); + tryCaliperDir(bestArea, bestDir, tangentAtMid, vertices, count, n3Mask); + } + } + } + + // ======================================================================== + // Factory methods + // ======================================================================== + + static Parallelogram buildForAxis(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, uint32_t convexMask, uint32_t n3Mask, float32_t2 dir) + { + float32_t2 perpDir = float32_t2(-dir.y, dir.x); + + float32_t minAlong = 1e10f; + float32_t maxAlong = -1e10f; + float32_t minPerp = 1e10f; + float32_t maxPerp = -1e10f; + + computeBoundsForAxis(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); + + Parallelogram result; + result.width = (float16_t)(maxAlong - minAlong); + result.height = (float16_t)(maxPerp - minPerp); + result.axisDir = float16_t2(dir); + result.corner = float16_t2(minAlong * dir + minPerp * perpDir); + + return result; + } + + // Real factory: takes a pre-materialized + pre-normalized vertex array. + // The (silhouette) overload below handles materialization. 
+ static Parallelogram createFromVertices(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count) + { + SilEdgeNormals precompSil = (SilEdgeNormals)0; + + uint32_t convexMask = 0; + uint32_t n3Mask = 0; + float32_t bestArea = 1e10f; + float32_t2 bestDir = float32_t2(1.0f, 0.0f); + + processEdge<0>(bestArea, bestDir, convexMask, n3Mask, vertices, count, precompSil); + processEdge<1>(bestArea, bestDir, convexMask, n3Mask, vertices, count, precompSil); + processEdge<2>(bestArea, bestDir, convexMask, n3Mask, vertices, count, precompSil); + if (count > 3) + { + processEdge<3>(bestArea, bestDir, convexMask, n3Mask, vertices, count, precompSil); + if (count > 4) + { + processEdge<4>(bestArea, bestDir, convexMask, n3Mask, vertices, count, precompSil); + if (count > 5) + { + processEdge<5>(bestArea, bestDir, convexMask, n3Mask, vertices, count, precompSil); + if (count > 6) + { + processEdge<6>(bestArea, bestDir, convexMask, n3Mask, vertices, count, precompSil); + } + } + } + } + + tryCaliperDir(bestArea, bestDir, float32_t2(1.0f, 0.0f), vertices, count, n3Mask); + tryCaliperDir(bestArea, bestDir, float32_t2(0.0f, 1.0f), vertices, count, n3Mask); + + Parallelogram best = buildForAxis(vertices, count, convexMask, n3Mask, bestDir); + + // Apex-draw cascade: literal per edge so vertices[I] / vertices[J] + // accesses keep vertices SROA-promoted (a single dynamic-index access here + // would demote the entire SilhouetteVerts to Function memory and tank + // every cascade above this point). 
+ apexDrawEdge<0, 1>(vertices, convexMask, n3Mask); + apexDrawEdge<1, 2>(vertices, convexMask, n3Mask); + if (count == 3) + { + apexDrawEdge<2, 0>(vertices, convexMask, n3Mask); + } + else + { + apexDrawEdge<2, 3>(vertices, convexMask, n3Mask); + if (count == 4) + { + apexDrawEdge<3, 0>(vertices, convexMask, n3Mask); + } + else + { + apexDrawEdge<3, 4>(vertices, convexMask, n3Mask); + if (count == 5) + { + apexDrawEdge<4, 0>(vertices, convexMask, n3Mask); + } + else + { + apexDrawEdge<4, 5>(vertices, convexMask, n3Mask); + if (count == 6) + { + apexDrawEdge<5, 0>(vertices, convexMask, n3Mask); + } + else // count == 7 + { + apexDrawEdge<5, 6>(vertices, convexMask, n3Mask); + apexDrawEdge<6, 0>(vertices, convexMask, n3Mask); + } + } + } + } + DebugRecorder::recordParallelogram(float32_t(best.width) * float32_t(best.height), convexMask, n3Mask, float32_t2(best.corner), float32_t2(best.axisDir), float32_t(best.width), float32_t(best.height)); + + best.normals = precompSil; + return best; + } + + // Per-edge apex-draw helper. Templated so vertices[I] / vertices[J] are + // literal-index reads. Skipped at runtime when the edge isn't convex. 
+ template + static void apexDrawEdge(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t convexMask, uint32_t n3Mask) + { + if ((convexMask & (1u << I)) == 0u) + return; + + const float32_t2 p0 = GET_PROJ_VERT(I); + const float32_t2 p1 = GET_PROJ_VERT(J); + + float32_t2 t0, endTangent; + getProjectedTangents(vertices[I], vertices[J], t0, endTangent); + + if (n3Mask & (1u << I)) + { + const float32_t2 tangentAtMid = evalCurveTangent(vertices[I], vertices[J], 0.5f); + const float32_t2 midPoint = evalCurvePoint(vertices[I], vertices[J], 0.5f); + + float32_t2 apex0, apex1; + computeApexClamped(p0, midPoint, t0, tangentAtMid, apex0); + computeApexClamped(midPoint, p1, tangentAtMid, endTangent, apex1); + + VisContext::add(SphereDrawer::drawDot(float32_t3(apex0, 0.0f), 0.03, 0.0f, float32_t3(1, 0, 1))); + VisContext::add(SphereDrawer::drawDot(float32_t3(midPoint, 0.0f), 0.02, 0.0f, float32_t3(0, 1, 0))); + VisContext::add(SphereDrawer::drawDot(float32_t3(apex1, 0.0f), 0.03, 0.0f, float32_t3(1, 0.5, 0))); + } + else + { + float32_t2 apex; + computeApexClamped(p0, p1, t0, endTangent, apex); + VisContext::add(SphereDrawer::drawDot(float32_t3(apex, 0.0f), 0.03, 0.0f, float32_t3(1, 0, 1))); + } + } + + // Convenience overload: materialize + normalize verts on the stack via the + // silhouette's +/- walk, then forward to the real factory. Local verts[7] + // dies when this function returns; the Parallelogram (with its embedded + // edge normals) is the only thing that outlives create(). + static Parallelogram create(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, shapes::OBBView view) + { + float32_t3 vertices[MAX_SILHOUETTE_VERTICES]; + silhouette.materializeNormalized(view, vertices); + return createFromVertices(vertices, silhouette.count); + } + + // TractableSampler::generate. 
Maps u in [0,1]^2 to a unit direction on the + // sphere via the orthographically-projected parallelogram, registers the + // pdf in the cache for O(1) forwardPdf, and stamps selectedIdx = 0 (no + // subdivision -- the field exists only for the visualization code path). + codomain_type generate(domain_type u, NBL_REF_ARG(cache_type) cache) + { + float16_t2 perpDir = float16_t2(-axisDir.y, axisDir.x); + + float16_t2 circleXY = corner + + (float16_t)(u.x) * width * axisDir + + (float16_t)(u.y) * height * perpDir; + + codomain_type direction = circleToSphere(circleXY); + + bool valid = direction.z > 0.0f && normals.isInside(direction); + // PDF in solid angle measure: the rectangle is in circle-space (scaled by CIRCLE_RADIUS), + // and the orthographic projection Jacobian is dA_circle/dω = CIRCLE_RADIUS^2 * z + cache.pdf = valid ? (CIRCLE_RADIUS * CIRCLE_RADIUS * direction.z / (scalar_type(width) * scalar_type(height))) : 0.0f; + + return direction; + } + + density_type forwardPdf(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; } + weight_type forwardWeight(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; } + uint32_t selectedIdx(cache_type cache) NBL_CONST_MEMBER_FUNC { return 0; } +}; + +#undef MAX_CURVE_APEXES +#undef GET_PROJ_VERT + +#endif // _SOLID_ANGLE_VIS_EXAMPLE_PARALLELOGRAM_SAMPLING_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl new file mode 100644 index 000000000..8d86cc1dc --- /dev/null +++ b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl @@ -0,0 +1,150 @@ +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". 
+//// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _SOLID_ANGLE_VIS_EXAMPLE_PYRAMID_SAMPLING_HLSL_INCLUDED_ +#define _SOLID_ANGLE_VIS_EXAMPLE_PYRAMID_SAMPLING_HLSL_INCLUDED_ + +// Thin shim over the builtin SphericalPyramid. The builtin (in +// nbl/builtin/hlsl/sampling/spherical_pyramid.hlsl) is the source of truth; +// this file re-exports it at example-global scope, adds a buildInner overload +// for the example-local BilinearSampler, and adds a templated debug+visualize +// helper that re-derives the intermediates the builtin's debug-free +// createFromVertices() doesn't expose. +#include "common.hlsl" + +#include +#include +#include +#include +#include +#include + +#include "silhouette.hlsl" +#include "drawing.hlsl" +#include "pyramid_sampling/bilinear.hlsl" + +// buildInner overload for the example-local BilinearSampler. Lives at global +// namespace so unqualified lookup from SphericalPyramid<_, BilinearSampler>::create +// (which the builtin defines in nbl::hlsl::sampling) finds it at instantiation. +inline BilinearSampler buildInner(float32_t3x3 basis, float32_t2 r0, float32_t2 ext, BilinearSampler /*tag*/) +{ + return BilinearSampler::create(basis, r0, ext); +} + +// Re-export at example-global scope so existing SphericalPyramid<...> spellings +// in frag/benchmark/SelectSampler keep compiling without qualification. +template +using SphericalPyramid = nbl::hlsl::sampling::SphericalPyramid; + +// PyramidDebugVis is a no-op for non-pyramid samplers. The pyramid +// specialization re-materializes silhouette verts, recovers (rectR0, rectExtents) +// by re-running computeBound3D against the pyramid's frame, finds the chosen +// edge from the local-frame silEdgeNormals (matches the old findChosenEdge +// heuristic), records DebugRecorder::recordPyramid, and emits the bounding +// great-circle + axes overlay. 
+template +struct PyramidDebugVis +{ + static void apply(SamplerT /*sampler*/, ClippedSilhouette /*silhouette*/, shapes::OBBView /*view*/) {} +}; + +template +struct PyramidDebugVis > +{ + using PyramidT = SphericalPyramid; + + // Cheap "which edge is most parallel to axis1" heuristic the original + // visualize() used: smallest |edgeNormals[i].x| in the local frame. + // silEdgeNormals are local-frame after createFromVertices transformToLocal. + static uint32_t findChosenEdgeLocal(PyramidT pyramid, uint32_t count) + { + uint32_t bestI = 0; + float32_t bestAbs = abs(pyramid.silEdgeNormals.edgeNormals[0].x); + for (uint32_t i = 0; i < count; i++) + { + const float32_t v = abs(pyramid.silEdgeNormals.edgeNormals[i].x); + const bool better = v < bestAbs; + bestAbs = nbl::hlsl::select(better, v, bestAbs); + bestI = nbl::hlsl::select(better, i, bestI); + } + return bestI; + } + + static void apply(PyramidT pyramid, ClippedSilhouette silhouette, shapes::OBBView view) + { + if (silhouette.count == 0) + return; + + float32_t3 vertices[MAX_SILHOUETTE_VERTICES]; + silhouette.materialize(view, vertices); + + const float32_t3 axis3 = pyramid.getAxis3(); + + // Recover (rectR0, rectExtents) from the pyramid frame. + float32_t4 bestBound; + PyramidT::computeBound3D(vertices, silhouette.count, pyramid.axis1, pyramid.axis2, axis3, bestBound); + bestBound.zw = max(bestBound.zw, bestBound.xy + 1e-6f); + const float32_t2 rectR0 = bestBound.xy; + const float32_t2 rectExtents = float32_t2(bestBound.zw - bestBound.xy); + + // 4-edge spherical rectangle solid angle from bounds, for the debug overlay. 
+ const float32_t4 denorm_n_z = float32_t4(-bestBound.y, bestBound.z, bestBound.w, -bestBound.x); + const float32_t4 n_z = denorm_n_z * rsqrt(float32_t4(1.0f, 1.0f, 1.0f, 1.0f) + denorm_n_z * denorm_n_z); + const float32_t4 cosGamma = float32_t4(-n_z[0] * n_z[1], -n_z[1] * n_z[2], -n_z[2] * n_z[3], -n_z[3] * n_z[0]); + math::sincos_accumulator acc = math::sincos_accumulator::create(cosGamma[0]); + acc.addCosine(cosGamma[1]); + acc.addCosine(cosGamma[2]); + acc.addCosine(cosGamma[3]); + const float32_t solidAngle = acc.getSumOfArccos() - 2.0f * numbers::pi; + + // bestEdge identification is post-hoc and approximate (the builtin + // create() doesn't track it). The visualize() overlay's orange highlight + // uses the local-frame |n.x| heuristic that's a reasonable proxy. + const uint32_t bestEdge = findChosenEdgeLocal(pyramid, silhouette.count); + + // Approximate centroid sign for the debug recorder. The original tracked + // -unnormCentroid during processEdge; -axis3 captures its direction. + DebugRecorder::recordPyramid(pyramid.axis1, pyramid.axis2, -axis3, bestBound, solidAngle, bestEdge); + + // Bounding great circles + axis dots overlay. 
+ const float32_t x0 = rectR0.x; + const float32_t x1 = rectR0.x + rectExtents.x; + const float32_t y0 = rectR0.y; + const float32_t y1 = rectR0.y + rectExtents.y; + const float32_t z = 1.0f; + const float32_t3 boundColor1 = float32_t3(1.0f, 0.5f, 0.5f); + const float32_t3 boundColor2 = float32_t3(0.5f, 0.5f, 1.0f); + const float32_t3 centerColor = float32_t3(1.0f, 1.0f, 0.0f); + + const float32_t3 bottomNormalLocal = normalize(float32_t3(0, -z, y0)); + const float32_t3 topNormalLocal = normalize(float32_t3(0, z, -y1)); + const float32_t3 leftNormalLocal = normalize(float32_t3(-z, 0, x0)); + const float32_t3 rightNormalLocal = normalize(float32_t3(z, 0, -x1)); + + const float32_t3 bottomNormal = bottomNormalLocal.x * pyramid.axis1 + bottomNormalLocal.y * pyramid.axis2 + bottomNormalLocal.z * axis3; + const float32_t3 topNormal = topNormalLocal.x * pyramid.axis1 + topNormalLocal.y * pyramid.axis2 + topNormalLocal.z * axis3; + const float32_t3 leftNormal = leftNormalLocal.x * pyramid.axis1 + leftNormalLocal.y * pyramid.axis2 + leftNormalLocal.z * axis3; + const float32_t3 rightNormal = rightNormalLocal.x * pyramid.axis1 + rightNormalLocal.y * pyramid.axis2 + rightNormalLocal.z * axis3; + + const float32_t centerX = (x0 + x1) * 0.5f; + const float32_t centerY = (y0 + y1) * 0.5f; + const float32_t3 centerLocal = normalize(float32_t3(centerX, centerY, z)); + const float32_t3 centerWorld = centerLocal.x * pyramid.axis1 + centerLocal.y * pyramid.axis2 + centerLocal.z * axis3; + + VisContext::add(SphereDrawer::drawCorner(centerWorld, 0.025f, 0.0f, centerColor)); + VisContext::add(SphereDrawer::drawGreatCircleHalf(bottomNormal, axis3, boundColor2, 0.004f)); + VisContext::add(SphereDrawer::drawGreatCircleHalf(topNormal, axis3, boundColor2, 0.004f)); + VisContext::add(SphereDrawer::drawGreatCircleHalf(leftNormal, axis3, boundColor1, 0.004f)); + VisContext::add(SphereDrawer::drawGreatCircleHalf(rightNormal, axis3, boundColor1, 0.004f)); + + const uint32_t bestJ = (bestEdge + 
1u) % silhouette.count; + float32_t3 chosen[2] = {vertices[bestEdge], vertices[bestJ]}; + VisContext::add(SphereDrawer::drawEdge(8u, chosen, 0.012f)); // colorLUT[8] = orange + + VisContext::add(SphereDrawer::drawDot(pyramid.axis1, 0.025f, 0.0f, float32_t3(1.0f, 0.0f, 0.0f))); + VisContext::add(SphereDrawer::drawDot(pyramid.axis2, 0.025f, 0.0f, float32_t3(0.0f, 1.0f, 0.0f))); + VisContext::add(SphereDrawer::drawDot(axis3, 0.025f, 0.0f, float32_t3(0.0f, 0.0f, 1.0f))); + } +}; + +#endif // _SOLID_ANGLE_VIS_EXAMPLE_PYRAMID_SAMPLING_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/bilinear.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/bilinear.hlsl new file mode 100644 index 000000000..4b0f85cbf --- /dev/null +++ b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/bilinear.hlsl @@ -0,0 +1,102 @@ +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". +//// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BILINEAR_HLSL_INCLUDED_ +#define _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BILINEAR_HLSL_INCLUDED_ +#include + +// Bilinear gnomonic-rect sampler. Stores the pyramid's basis so generate() +// returns world-space dirs (matching SphericalRectangle's contract). 
+struct BilinearSampler +{ + using scalar_type = float32_t; + using vector2_type = float32_t2; + using vector3_type = float32_t3; + using matrix3x3_type = float32_t3x3; + using domain_type = vector2_type; + using codomain_type = vector3_type; + using density_type = scalar_type; + using weight_type = density_type; + + nbl::hlsl::sampling::Bilinear sampler; + matrix3x3_type basis; + float32_t2 rectR0; + float32_t2 rectExtents; + float32_t rcpRectArea; + + struct cache_type + { + nbl::hlsl::sampling::Bilinear::cache_type bilinearCache; + float32_t dist2; + float32_t rcpLen; + }; + + static BilinearSampler create(matrix3x3_type basis, float32_t2 rectR0, float32_t2 rectExtents) + { + BilinearSampler self; + self.basis = basis; + + // 4 corner positions on the rectangle + const float32_t x0 = rectR0.x; + const float32_t x1 = x0 + rectExtents.x; + const float32_t y0 = rectR0.y; + const float32_t y1 = y0 + rectExtents.y; + + // dSA(x,y) = 1 / (x^2 + y^2 + 1)^(3/2) [z = 1.0 in local frame] + const float32_t xx0 = x0 * x0, xx1 = x1 * x1; + const float32_t yy0 = y0 * y0, yy1 = y1 * y1; + + // d^{-3/2} = rsqrt(d)^3: 1 rsqrt + 2 mul instead of 1 rsqrt + 1 div + float32_t r; + r = rsqrt(xx0 + yy0 + 1.0f); + const float32_t v00 = r * r * r; + r = rsqrt(xx1 + yy0 + 1.0f); + const float32_t v10 = r * r * r; + r = rsqrt(xx0 + yy1 + 1.0f); + const float32_t v01 = r * r * r; + r = rsqrt(xx1 + yy1 + 1.0f); + const float32_t v11 = r * r * r; + + // Bilinear layout: (x0y0, x0y1, x1y0, x1y1) + self.sampler = nbl::hlsl::sampling::Bilinear::create(float32_t4(v00, v01, v10, v11)); + self.rectR0 = rectR0; + self.rectExtents = rectExtents; + self.rcpRectArea = rcp(max(rectExtents.x * rectExtents.y, 1e-20f)); + + return self; + } + + // Returns world-space unit direction; caches dist2 and rcpLen for forwardPdf. + // Returns local-frame unit direction; caches dist2/rcpLen for forwardPdf. + // hitDist == 1/rcpLen (the gnomonic ray length on the rect at z=1). 
+ codomain_type generateNormalizedLocal(domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) + { + const vector2_type uv = sampler.generate(u, cache.bilinearCache); + const scalar_type localX = rectR0.x + uv.x * rectExtents.x; + const scalar_type localY = rectR0.y + uv.y * rectExtents.y; + cache.dist2 = localX * localX + localY * localY + 1.0f; + cache.rcpLen = rsqrt(cache.dist2); + hitDist = 1.0f / cache.rcpLen; + return codomain_type(localX, localY, 1.0f) * cache.rcpLen; + } + + codomain_type generate(domain_type u, NBL_REF_ARG(cache_type) cache) + { + scalar_type dummy; + const vector3_type localDir = generateNormalizedLocal(u, cache, dummy); + return basis[0] * localDir.x + basis[1] * localDir.y + basis[2] * localDir.z; + } + + // Solid-angle-measure pdf: bilinearPdf * dist2^{3/2} * rcpRectArea. + density_type forwardPdf(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC + { + return sampler.forwardPdf(u, cache.bilinearCache) * cache.dist2 * cache.dist2 * cache.rcpLen * rcpRectArea; + } + + weight_type forwardWeight(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC + { + return forwardPdf(u, cache); + } +}; + +#endif // _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BILINEAR_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/ray_vis.frag.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/ray_vis.frag.hlsl new file mode 100644 index 000000000..79268dc93 --- /dev/null +++ b/73_SolidAngleVisualizer/app_resources/hlsl/ray_vis.frag.hlsl @@ -0,0 +1,110 @@ +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". 
+//// For conditions of distribution and use, see copyright notice in nabla.h +#pragma wave shader_stage(fragment) + +#include "common.hlsl" +#include "debug_vis.hlsl" +#include +#include "utils.hlsl" + +using namespace nbl::hlsl; +using namespace ext::FullScreenTriangle; + +[[vk::push_constant]] struct PushConstantRayVis pc; + +#include "drawing.hlsl" + +struct RayVisOutput +{ + float32_t4 color : SV_Target0; + float32_t depth : SV_Depth; +}; + +// [shader("pixel")] +[[vk::location(0)]] RayVisOutput main(SVertexAttributes vx) +{ + RayVisOutput output; + output.color = float32_t4(0.0, 0.0, 0.0, 0.0); + output.depth = 0.0; // Far plane in reversed-Z (near=0, far=1) + float32_t maxDepth = 0.0; // Track closest depth (minimum in reversed-Z) + float32_t aaWidth = length(float32_t2(ddx(vx.uv.x), ddy(vx.uv.y))); + + // Convert to NDC space with aspect ratio correction + float32_t2 ndcPos = vx.uv * 2.0f - 1.0f; + float32_t aspect = pc.viewport.z / pc.viewport.w; + ndcPos.x *= aspect; + VisContext::begin(ndcPos, float32_t3(0, 0, 0), aaWidth); + + // Draw vertices in 3D. clippedVertices are stored in shading-point-relative + // coords (the frag materializes with pc.shadingPoint); shift back to world. 
+ for (uint32_t v = 0; v < DebugDataBuffer[0].silhouette.clippedVertexCount; v++) + { + float32_t3 worldVertex = DebugDataBuffer[0].silhouette.clippedVertices[v] + pc.shadingPoint; + float32_t4 clipPos = mul(pc.viewProjMatrix, float32_t4(worldVertex, 1.0)); + float32_t3 ndcPosVertex = clipPos.xyz / clipPos.w; + ndcPosVertex.x *= aspect; + if (ndcPosVertex.z < maxDepth) + continue; + + float32_t4 intensity = SphereDrawer::drawDot(ndcPosVertex, 0.03, 0.0, colorLUT[DebugDataBuffer[0].silhouette.clippedVertexIndices[v]]); + + // Update depth only where we drew something + if (intensity.a > 0.0) + { + VisContext::add(intensity); + maxDepth = max(maxDepth, 1.0f - ndcPosVertex.z); + } + } + + // Draw sample rays + for (uint32_t i = 0; i < DebugDataBuffer[0].sampling.sampleCount; i++) + { + float32_t3 rayOrigin = pc.shadingPoint; + float32_t4 directionAndPdf = DebugDataBuffer[0].sampling.rayData[i]; + float32_t3 rayDir = normalize(directionAndPdf.xyz); + + shapes::OBBView obb = shapes::OBBView::create(pc.modelMatrix); + shapes::OBBView::Intersection intersection = obb.rayIntersection(rayOrigin, rayDir); + + float32_t arrowLength; + float32_t3 arrowColor; + + if (intersection.hit) + { + // Use tMax (exit point at back face) + float32_t3 worldExitPoint = rayOrigin + rayDir * intersection.tMax; + arrowLength = intersection.tMax; + arrowColor = float32_t3(0.0, 1.0, 0.0); // Green for valid samples + } + else + { + // Ray doesn't intersect + float32_t3 cubeCenter = obb.getCenter(); + arrowLength = length(cubeCenter - rayOrigin) + 2.0; // make it a little taller + arrowColor = float32_t3(1.0, 0.0, 0.0); // Red for BROKEN samples + } + + SphereDrawer::ArrowResult arrow = SphereDrawer::visualizeRayAsArrow(rayOrigin, directionAndPdf, arrowLength, ndcPos, aspect, pc.viewProjMatrix); + + // Only update depth if arrow was actually drawn + if (arrow.color.a > 0.0) + { + maxDepth = max(maxDepth, arrow.depth); + } + + // Modulate arrow color by its alpha (only add where arrow is visible) 
+ VisContext::add(float32_t4(arrowColor * arrow.color.a, 0.0)); + output.color.a = max(output.color.a, arrow.color.a); + } + + // Clamp to prevent overflow + output.color.rgb += VisContext::flush().rgb; + output.color = saturate(output.color); + output.color.a = 1.0; + + // Write the closest depth (minimum in reversed-Z) + output.depth = maxDepth; + + return output; +} diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl new file mode 100644 index 000000000..7429b7400 --- /dev/null +++ b/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl @@ -0,0 +1,76 @@ +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". +//// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _SOLID_ANGLE_VIS_EXAMPLE_SILHOUETTE_HLSL_INCLUDED_ +#define _SOLID_ANGLE_VIS_EXAMPLE_SILHOUETTE_HLSL_INCLUDED_ + +// Thin shim over the builtin OBB silhouette. The builtin (in +// nbl/builtin/hlsl/shapes/obb_silhouette.hlsl) is the source of truth for +// ClippedSilhouette / BinSilhouette / SilEdgeNormals; this file re-exports +// them at example-global scope and adds debug-recording wrappers that re-derive +// the intermediates the builtin's debug-free create() doesn't expose. +#include "common.hlsl" +#include "debug_vis.hlsl" +#include "utils.hlsl" +#include +#include +#include +#include + +using namespace nbl; +using namespace nbl::hlsl; + +// Re-export builtin types at example-global scope so existing callsites +// (ClippedSilhouette::create, BinSilhouette::data, ...) keep compiling. +using BinSilhouette = nbl::hlsl::shapes::BinSilhouette; +using ClippedSilhouette = nbl::hlsl::shapes::ClippedSilhouette; +using SilEdgeNormals = nbl::hlsl::shapes::SilEdgeNormals; + +// Debug-recording wrapper around ClippedSilhouette::create. 
// Re-derives clipMask, rotateAmount, wrapAround and rotatedClipMask by
// re-running the same classifier the builtin uses, then emits
// DebugRecorder::recordClipResult. The silhouette handed back to the caller is
// exactly what ClippedSilhouette::create produced; everything recomputed below
// is only forwarded to the debug recorder.
ClippedSilhouette createClippedSilhouetteDbg(shapes::OBBView view, float32_t3 shadingPoint)
{
    const ClippedSilhouette clipped = ClippedSilhouette::create(view, shadingPoint);

    // Classify the shading point against the three box axes: per axis the
    // region is 0 (above), 1 (inside the slab) or 2 (below), and the three
    // digits are combined base-3 into the configuration index.
    const float32_t3 pointToMin = view.minCorner - shadingPoint;
    const float32_t3 axisLenSq = float32_t3(
        dot(view.columns[0], view.columns[0]),
        dot(view.columns[1], view.columns[1]),
        dot(view.columns[2], view.columns[2]));
    const float32_t3 along = -float32_t3(
        dot(view.columns[0], pointToMin),
        dot(view.columns[1], pointToMin),
        dot(view.columns[2], pointToMin));
    const uint32_t3 isBelow = uint32_t3(along < float32_t3(0, 0, 0));
    const uint32_t3 isAbove = uint32_t3(along > axisLenSq);
    const uint32_t3 region = uint32_t3(uint32_t3(1u, 1u, 1u) + isBelow - isAbove);
    const uint32_t configIndex = region.x + 3u * (region.y + 3u * region.z);

    BinSilhouette sil = BinSilhouette::create(configIndex);
    const uint32_t vertexCount = sil.getVertexCount();
    const uint32_t validMask = (1u << vertexCount) - 1u;

    // One bit per silhouette slot: set when that vertex's Z is below the
    // shading point's Z. Slots past vertexCount are masked off afterwards.
    uint32_t clipMask = 0u;
    NBL_UNROLL
    for (uint32_t bit = 0; bit < 6; ++bit)
    {
        const uint32_t isClipped = hlsl::select(view.getVertexZ(sil.getVertexIndex(bit)) < shadingPoint.z, 1u, 0u);
        clipMask |= isClipped << bit;
    }
    clipMask &= validMask;

    const uint32_t clipCount = countbits(clipMask);
    const uint32_t keptMask = ~clipMask & validMask;
    // wrapAround: both the first and the last silhouette slot are clipped, so
    // the clipped run straddles the end of the vertex list.
    const bool wrapAround = (clipMask & (clipMask >> (vertexCount - 1))) != 0u;
    const uint32_t rotateAmount = nbl::hlsl::select(wrapAround, firstbitlow(keptMask), firstbithigh(clipMask) + 1);
    const uint32_t rotatedClipMask = nbl::hlsl::rotr(clipMask, rotateAmount, vertexCount);

    DebugRecorder::recordClipResult(clipped.count, clipMask, clipCount, rotatedClipMask, rotateAmount, clipped.positiveCount, wrapAround, sil.data);
    return clipped;
}

// Originals tagged with their cube corner index; clip verts use sentinels 23/24.
// Replaces the ClippedSilhouette::recordVertices member that was stripped from
// the builtin. recordClippedVertex is a no-op in release.
//
// The first `positiveCount` entries of `vertices` are recorded with the cube
// corner index reported by silhouette.cornerIndex(); when the silhouette holds
// more vertices than that, the next two entries are recorded with the
// sentinel tags 23 and 24.
void recordClippedSilhouetteVertices(ClippedSilhouette silhouette, float32_t3 vertices[MAX_SILHOUETTE_VERTICES])
{
    for (uint32_t slot = 0; slot < silhouette.positiveCount; ++slot)
    {
        DebugRecorder::recordClippedVertex(slot, vertices[slot], silhouette.cornerIndex(slot));
    }

    // Nothing was clipped: no extra vertices to record.
    if (!(silhouette.count > silhouette.positiveCount))
        return;

    DebugRecorder::recordClippedVertex(silhouette.positiveCount, vertices[silhouette.positiveCount], 23u);
    DebugRecorder::recordClippedVertex(silhouette.positiveCount + 1u, vertices[silhouette.positiveCount + 1u], 24u);
}

#endif // _SOLID_ANGLE_VIS_EXAMPLE_SILHOUETTE_HLSL_INCLUDED_
diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl
new file mode 100644
index 000000000..364cd78e1
--- /dev/null
+++ b/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl
@@ -0,0 +1,125 @@
//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O.
//// This file is part of the "Nabla Engine".
//// For conditions of distribution and use, see copyright notice in nabla.h
#pragma wave shader_stage(fragment)

#include "common.hlsl"
#include

using namespace nbl::hlsl;
using namespace ext::FullScreenTriangle;

#include "drawing.hlsl"
#include "utils.hlsl"
#include "silhouette.hlsl"
#include "triangle_sampling.hlsl"
#include "parallelogram_sampling.hlsl"
#include "pyramid_sampling.hlsl"
#include "obb_face_sampling.hlsl"

[[vk::push_constant]] struct PushConstants pc;

static const SAMPLING_MODE_FLAGS samplingMode = SAMPLING_MODE_FLAGS_CONST;

// Maps the compile-time sampling mode to the sampler type used by main().
template struct SelectSampler;
template<> struct SelectSampler { using type = TriangleFanSampler; };
template<> struct SelectSampler { using type = TriangleFanSampler; };
template<> struct SelectSampler { using type = Parallelogram; };
template<> struct SelectSampler { using type = SphericalPyramid >; };
template<> struct SelectSampler { using type = SphericalPyramid >; };
template<> struct SelectSampler { using type = SphericalPyramid >; };
template<> struct SelectSampler { using type = SphericalPyramid >; };
template<> struct SelectSampler { using type = SphericalPyramid >; };
template<> struct SelectSampler { using type = SphericalPyramid; };
template<> struct SelectSampler { using type = OBBFaceSampler; };
template<> struct SelectSampler { using type = Parallelogram; };

using SelectedSampler = typename SelectSampler::type;

// Converts the full-screen-triangle UV of this fragment into
//   ndc       - UV remapped to [-1,1] with x scaled by the viewport aspect
//               ratio (pc.viewport.z / pc.viewport.w), and
//   spherePos - a unit direction for that pixel.
// Pixels inside the circle of radius CIRCLE_RADIUS land on the z >= 0
// hemisphere (z = sqrt(1 - r^2)); pixels outside it use the
// (2x, 2y, 1 - r^2) / (1 + r^2) mapping, which gives z < 0.
void computeSpherePos(SVertexAttributes vx, out float32_t2 ndc, out float32_t3 spherePos)
{
    const float32_t aspectRatio = pc.viewport.z / pc.viewport.w;
    ndc = vx.uv * 2.0f - 1.0f;
    ndc.x = ndc.x * aspectRatio;

    const float32_t2 disk = ndc / CIRCLE_RADIUS;
    const float32_t radiusSq = dot(disk, disk);

    float32_t3 dir;
    if (radiusSq > 1.0f)
    {
        const float32_t denom = radiusSq + 1.0f;
        dir = float32_t3(disk.x * 2.0f, disk.y * 2.0f, 1.0f - radiusSq) / denom;
    }
    else
    {
        dir = float32_t3(disk.x, disk.y, sqrt(1.0f - radiusSq));
    }
    spherePos = normalize(dir);
}

// Fragment entry point of the solid-angle visualizer.
//  1. Maps the pixel to a direction on the visualization sphere.
//  2. Builds the clipped OBB silhouette and the compile-time selected sampler.
//  3. Draws pc.sampleCount stratified samples, recording/visualizing every
//     sample whose pdf is positive.
//  4. Draws the silhouette edges and the outline ring, and records per-frame
//     debug data.
// Returns the colour accumulated in VisContext, except for the small swatch
// rectangle in UV space that shows the configuration colour when the
// visualization is enabled.
[[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0
{
    float32_t aaWidth = length(float32_t2(ddx(vx.uv.x), ddy(vx.uv.y)));
    float32_t3 spherePos;
    float32_t2 ndc;
    computeSpherePos(vx, ndc, spherePos);
    VisContext::begin(ndc, spherePos, aaWidth);

    shapes::OBBView view = shapes::OBBView::create(pc.modelMatrix);
    ClippedSilhouette silhouette = createClippedSilhouetteDbg(view, pc.shadingPoint);

    SelectedSampler sampler = SelectedSampler::create(silhouette, view);
    PyramidDebugVis::apply(sampler, silhouette, view);

    // Stratification grid: the smallest square grid that holds sampleCount
    // cells. The previous code hard-coded an 8-wide grid (i & 7, i >> 3) while
    // dividing by sqrt(sampleCount), which pushed xi outside [0,1)^2 for any
    // sampleCount other than 64. For sampleCount == 64 this yields the same
    // 8x8 layout as before. The integer correction after round() keeps the
    // result exact even if the GPU sqrt is off by a few ULP.
    uint32_t gridDim = uint32_t(round(sqrt(float32_t(pc.sampleCount))));
    if (gridDim * gridDim < pc.sampleCount)
        gridDim++;
    const float32_t gridDimF = float32_t(gridDim);

    uint32_t validSampleCount = 0;
    for (uint32_t i = 0; i < pc.sampleCount; i++)
    {
        // gridDim >= 1 whenever this loop body runs (sampleCount >= 1).
        const uint32_t cellX = i % gridDim;
        const uint32_t cellY = i / gridDim;
        // The tiny ndc-dependent term is kept from the original code; it makes
        // xi depend on the pixel without visibly moving the sample.
        float32_t2 xi = float32_t2(
            (float32_t(cellX) + 0.5f) / gridDimF + ndc.x * 1e-9f,
            (float32_t(cellY) + 0.5f) / gridDimF + ndc.y * 1e-9f);

        typename SelectedSampler::cache_type cache;
        const float32_t3 sampleDir = sampler.generate(xi, cache);
        const float32_t pdf = sampler.forwardPdf(xi, cache);

        // Samplers report pdf == 0 for invalid samples; NaN also fails this test.
        if (pdf > 0.0f)
        {
            validSampleCount++;
            DebugRecorder::recordRay(i, sampleDir, pdf);
            if (VisContext::enabled())
                VisContext::add(SphereDrawer::visualizeSample(sampleDir, xi, sampler.selectedIdx(cache), vx.uv));
            else
                VisContext::add(float4(sampleDir * 0.02f / pdf, 1.0f));
        }
    }

    // Silhouette edges + debug recording. Re-materialize verts here -- the
    // sampler may have absorbed its own copy already, but `vertices` is local
    // to this scope and dies at function end anyway.
    {
        float32_t3 vertices[MAX_SILHOUETTE_VERTICES];
        silhouette.materialize(view, vertices);
        recordClippedSilhouetteVertices(silhouette, vertices);

        // Closed loop: the last vertex connects back to the first.
        for (uint32_t i = 0; i < silhouette.count; i++)
        {
            const uint32_t j = (i + 1u < silhouette.count) ? i + 1u : 0u;
            const float32_t3 e0 = normalize(vertices[i]);
            const float32_t3 e1 = normalize(vertices[j]);
            const float32_t3 ePts[2] = {e0, e1};
            VisContext::add(SphereDrawer::drawEdge(0, ePts, aaWidth));
        }

        // Configuration colour swatch: uv.x in [0, 0.03], uv.y in [0.97, 1].
        const uint32_t configIndex = silhouette.getConfigIndex();
        if (VisContext::enabled() && all(vx.uv >= float32_t2(0.f, 0.97f)) && all(vx.uv <= float32_t2(0.03f, 1.0f)))
            return float32_t4(colorLUT[configIndex], 1.0f);
        VisContext::add(SphereDrawer::drawRing(ndc));

        const BinSilhouette binSil = silhouette.getOriginalBinSilhouette();
        uint32_t vertexIndices[6];
        for (uint32_t i = 0; i < 6; i++)
            vertexIndices[i] = uint32_t(binSil.getVertexIndex(i));
        DebugRecorder::recordFrameEnd(silhouette.getRegion(), configIndex, binSil.getVertexCount(), binSil.data, vertexIndices, validSampleCount, pc.sampleCount);
    }
    return VisContext::flush();
}
diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/triangle_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/triangle_sampling.hlsl
new file mode 100644
index 000000000..d4fd9902e
--- /dev/null
+++ b/73_SolidAngleVisualizer/app_resources/hlsl/triangle_sampling.hlsl
@@ -0,0 +1,370 @@
//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O.
//// This file is part of the "Nabla Engine".
//// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _SOLID_ANGLE_VIS_EXAMPLE_TRIANGLE_SAMPLING_HLSL_INCLUDED_
#define _SOLID_ANGLE_VIS_EXAMPLE_TRIANGLE_SAMPLING_HLSL_INCLUDED_

// Include the spherical triangle utilities
#include "common.hlsl"
#include
#include
#include
#include
#include "silhouette.hlsl"

using namespace nbl::hlsl;

// Maximum number of triangles we can have after clipping
// Without clipping, max 3 faces can be visible at once so 3 faces * 2 triangles = 6 edges, forming max 4 triangles
// With clipping, one more edge.
// 7 - 2 = 5 max triangles because fanning from one vertex
#define MAX_TRIANGLES 5

// ============================================================================
// TriangleFanSampler: importance-sampled fan triangulation of the clipped
// silhouette. create() takes the silhouette plus the OBB view, materializes
// the silhouette verts internally and stores them as a member so generate()
// has random access without the caller threading verts through.
//
// All loops over silCount/triangle-count are cascade-unrolled (instead of
// `for + break`) so every `self.verts[K]` / `cdf[K]` / `triangleSolidAngles[K]`
// access has a literal slot index. This keeps the local arrays in registers
// (SROA-promoted) instead of spilling to addressable Function memory -- a
// single dynamic-index access would demote the whole array and tank every
// subsequent read.
//
// NOTE(review): the template parameter list below (and the one on
// processFanTri) is reconstructed from how `Projected` / `I` are used in this
// file -- confirm against the original header. Template arguments on other
// type names are left exactly as they appear in the source.
// ============================================================================
template<bool Projected>
struct TriangleFanSampler
{
    using scalar_type = float32_t;
    using vector2_type = float32_t2;
    using vector3_type = float32_t3;
    using domain_type = vector2_type;
    using codomain_type = vector3_type;
    using density_type = scalar_type;
    using weight_type = density_type;

    // Cache for the TractableSampler concept. Stores the full sample pdf
    // (selectionProb * trianglePdf) so forwardPdf is an O(1) load, plus the
    // selected fan-triangle index (used by the visualization code path to
    // colour each triangle differently).
    struct cache_type
    {
        density_type pdf;
        uint32_t selectedIdx;
    };

    uint32_t count;                               // Number of fan triangles (silCount - 2, degenerate ones included)
    float32_t totalWeight;                        // Sum of all triangle weights (for PDF computation)
    float32_t3 faceNormal;                        // Face normal (only used for projected mode)
    float32_t cdf[MAX_TRIANGLES];                 // Normalized CDF: cdf[i] = sum(weight[0..i]) / totalWeight
    float32_t triangleSolidAngles[MAX_TRIANGLES]; // Raw weight per triangle (for PDF after selection)
    uint32_t triangleIndices[MAX_TRIANGLES];      // Vertex index i (forms triangle with v0, vi, vi+1)
    float32_t3 verts[MAX_SILHOUETTE_VERTICES];

    // Build fan triangulation, cache weights for triangle selection.
    // Materializes silhouette verts via silhouette.materialize(view, ...) and
    // keeps them as a member for sample-time access.
    // With fewer than 3 silhouette vertices the sampler is returned empty
    // (count == 0, totalWeight == 0) and generate() reports pdf == 0.
    static TriangleFanSampler create(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, shapes::OBBView view)
    {
        TriangleFanSampler self;
        self.totalWeight = 0.0f;
        self.faceNormal = float32_t3(0, 0, 0);
        const uint32_t silCount = silhouette.count;
        silhouette.materialize(view, self.verts);

        // Pre-zero the per-triangle arrays so unused slots are well-defined --
        // the cascade below populates exactly silCount-2 slots and we don't
        // want the tail to leak garbage into the CDF.
        NBL_UNROLL
        for (uint32_t z = 0; z < MAX_TRIANGLES; z++)
        {
            self.triangleSolidAngles[z] = 0.0f;
            self.triangleIndices[z] = 0u;
            self.cdf[z] = 0.0f;
        }

        if (silCount < 3)
        {
            self.count = 0;
            return self;
        }

        const float32_t3 v0 = self.verts[0];

        // Compute face normal ONCE before the cascade - silhouette is planar!
        if (Projected)
        {
            const float32_t3 v1 = self.verts[1];
            const float32_t3 v2 = self.verts[2];
            self.faceNormal = normalize(cross(v1 - v0, v2 - v0));
        }

        // Fan triangulation: triangles (v0, self.verts[I], self.verts[I+1]) for I = 1..silCount-2.
        // Cascade-on-silCount so each call site has literal I.
        processFanTri<1>(v0, self.faceNormal, self);
        if (silCount > 3)
        {
            processFanTri<2>(v0, self.faceNormal, self);
            if (silCount > 4)
            {
                processFanTri<3>(v0, self.faceNormal, self);
                if (silCount > 5)
                {
                    processFanTri<4>(v0, self.faceNormal, self);
                    if (silCount > 6)
                        processFanTri<5>(v0, self.faceNormal, self);
                }
            }
        }
        // self.count = silCount - 2 (every triangle slot gets populated, possibly
        // with zero weight for degenerates -- they're handled cleanly by the CDF).
        self.count = silCount - 2u;

        // CDF build: cascade-on-count so cdf[K] / triangleSolidAngles[K] are
        // literal-index accesses; otherwise the whole sampler struct's arrays
        // would demote to Function memory.
        const float32_t rcpTotal = (self.totalWeight > 0.0f) ? rcp(self.totalWeight) : 0.0f;
        float32_t cumulative = 0.0f;

        cumulative += self.triangleSolidAngles[0];
        self.cdf[0] = cumulative * rcpTotal;
        if (self.count > 1)
        {
            cumulative += self.triangleSolidAngles[1];
            self.cdf[1] = cumulative * rcpTotal;
            if (self.count > 2)
            {
                cumulative += self.triangleSolidAngles[2];
                self.cdf[2] = cumulative * rcpTotal;
                if (self.count > 3)
                {
                    cumulative += self.triangleSolidAngles[3];
                    self.cdf[3] = cumulative * rcpTotal;
                    if (self.count > 4)
                    {
                        cumulative += self.triangleSolidAngles[4];
                        self.cdf[4] = cumulative * rcpTotal;
                    }
                }
            }
        }

#if DEBUG_DATA
        // Debug-only closed-loop walk over silhouette edges. Release builds DCE
        // both the loop (recordTriangleFan is a no-op stub) and luneDetected.
        bool luneDetected = false;
        for (uint32_t i = 0; i < silCount; i++)
        {
            const uint32_t j = (i + 1u < silCount) ? i + 1u : 0u;
            const float32_t3 ni = nbl::hlsl::normalize(self.verts[i]);
            const float32_t3 nj = nbl::hlsl::normalize(self.verts[j]);
            if (dot(ni, nj) < -0.99f)
            {
                luneDetected = true;
                assert(false && "Spherical lune detected: antipodal silhouette edge");
            }
        }
        DebugRecorder::recordTriangleFan(luneDetected, self.count, self.totalWeight, self.triangleSolidAngles);
#else
        DebugRecorder::recordTriangleFan(false, self.count, self.totalWeight, self.triangleSolidAngles);
#endif

        return self;
    }

    // TractableSampler::generate. Picks a fan triangle by xi.x via the cached
    // CDF, samples within it, and registers (selectedIdx, pdf) in the cache so
    // forwardPdf is an O(1) load. Geometry is reconstructed on-demand from
    // `verts`. The CDF-select and triangle-reconstruct steps both use
    // literal-index cascades on count / vertexIdx -- a single dynamic-index
    // access into verts / cdf / triangleIndices would demote those arrays to
    // Function memory and slow every call.
    //
    // Returns a normalized direction. When the sampler is empty or the chosen
    // triangle has zero weight, cache.pdf is 0 so callers can reject the sample.
    codomain_type generate(domain_type xi, NBL_REF_ARG(cache_type) cache)
    {
        // Handle empty or invalid data
        if (count == 0 || totalWeight <= 0.0f)
        {
            cache.pdf = 0.0f;
            cache.selectedIdx = 0;
            return codomain_type(0, 0, 1);
        }

        // Use a local idx for all the cascade work; assign to the cache once at
        // the end so the cache field doesn't get pessimised by repeated stores.
        uint32_t idx = count - 1u; // fall-through default for numerical roundoff
        scalar_type prevCdf = 0.0f;
        if (xi.x <= cdf[0])
        {
            idx = 0;
        }
        else if (count > 1 && xi.x <= cdf[1])
        {
            idx = 1;
            prevCdf = cdf[0];
        }
        else if (count > 2 && xi.x <= cdf[2])
        {
            idx = 2;
            prevCdf = cdf[1];
        }
        else if (count > 3 && xi.x <= cdf[3])
        {
            idx = 3;
            prevCdf = cdf[2];
        }
        else if (count > 4 && xi.x <= cdf[4])
        {
            idx = 4;
            prevCdf = cdf[3];
        }
        else // fall-through to last valid triangle
        {
            if (count == 2)
                prevCdf = cdf[0];
            else if (count == 3)
                prevCdf = cdf[1];
            else if (count == 4)
                prevCdf = cdf[2];
            else if (count == 5)
                prevCdf = cdf[3];
        }
        cache.selectedIdx = idx;

        // cdf[idx] read also via cascade so the array stays SROA'd.
        scalar_type selectedCdf;
        if (idx == 0)
            selectedCdf = cdf[0];
        else if (idx == 1)
            selectedCdf = cdf[1];
        else if (idx == 2)
            selectedCdf = cdf[2];
        else if (idx == 3)
            selectedCdf = cdf[3];
        else
            selectedCdf = cdf[4];

        // Remap xi.x to [0,1] within the selected CDF interval; the max()
        // avoids dividing by a zero-width interval.
        const scalar_type cdfWidth = selectedCdf - prevCdf;
        const scalar_type u = (xi.x - prevCdf) / max(cdfWidth, 1e-7f);

        // Selection weight of the chosen triangle (solid angle, or projected
        // solid angle in Projected mode). Kept untouched below so the selection
        // probability is always weight / totalWeight.
        scalar_type triSolidAngle;
        if (idx == 0)
            triSolidAngle = triangleSolidAngles[0];
        else if (idx == 1)
            triSolidAngle = triangleSolidAngles[1];
        else if (idx == 2)
            triSolidAngle = triangleSolidAngles[2];
        else if (idx == 3)
            triSolidAngle = triangleSolidAngles[3];
        else
            triSolidAngle = triangleSolidAngles[4];

        uint32_t vertexIdx;
        if (idx == 0)
            vertexIdx = triangleIndices[0];
        else if (idx == 1)
            vertexIdx = triangleIndices[1];
        else if (idx == 2)
            vertexIdx = triangleIndices[2];
        else if (idx == 3)
            vertexIdx = triangleIndices[3];
        else
            vertexIdx = triangleIndices[4];

        // Reconstruct triangle geometry. vertexIdx is in [1, MAX_SILHOUETTE_VERTICES-2]
        // and is data-dependent on xi -- cascade so verts[vertexIdx] / verts[vertexIdx+1]
        // become literal-index reads. With our 7-vertex max, vertexIdx <= 5.
        const codomain_type v0 = verts[0];
        codomain_type v1, v2;
        if (vertexIdx == 1)
        {
            v1 = verts[1];
            v2 = verts[2];
        }
        else if (vertexIdx == 2)
        {
            v1 = verts[2];
            v2 = verts[3];
        }
        else if (vertexIdx == 3)
        {
            v1 = verts[3];
            v2 = verts[4];
        }
        else if (vertexIdx == 4)
        {
            v1 = verts[4];
            v2 = verts[5];
        }
        else
        {
            v1 = verts[5];
            v2 = verts[6];
        } // vertexIdx == 5

        const codomain_type origin = codomain_type(0, 0, 0);

        const codomain_type triVerts[3] = {v0, v1, v2};
        shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(triVerts, origin);

        // Sample based on mode. trianglePdf is the density of `direction`
        // within the selected triangle.
        codomain_type direction;
        scalar_type trianglePdf;
        const domain_type u2 = domain_type(u, xi.y);

        if (Projected)
        {
            // faceNormal was precomputed during create(), silhouette is planar
            sampling::ProjectedSphericalTriangle samplingTri = sampling::ProjectedSphericalTriangle::create(shapeTri, faceNormal, false);
            sampling::ProjectedSphericalTriangle::cache_type triCache;
            direction = samplingTri.generate(u2, triCache);
            trianglePdf = samplingTri.forwardPdf(u2, triCache);
        }
        else
        {
            sampling::SphericalTriangle samplingTri = sampling::SphericalTriangle::create(shapeTri);
            sampling::SphericalTriangle::cache_type triCache;
            direction = samplingTri.generate(u2, triCache);
            // Uniform over the spherical triangle.
            trianglePdf = 1.0f / triSolidAngle;
        }

        // Final pdf = trianglePdf * selectionProb, with
        // selectionProb = triSolidAngle / totalWeight. Previously the projected
        // branch overwrote triSolidAngle with 1/forwardPdf, which cancelled the
        // per-triangle pdf and collapsed the result to 1/totalWeight.
        // A zero-weight triangle (only reachable through the roundoff
        // fall-through) reports pdf 0 instead of inf * 0 = NaN.
        const scalar_type selectionProb = triSolidAngle / totalWeight;
        cache.pdf = (selectionProb > 0.0f) ? trianglePdf * selectionProb : 0.0f;

        return normalize(direction);
    }

    // The pdf and the selected triangle were stored by generate(); these are plain loads.
    density_type forwardPdf(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; }
    weight_type forwardWeight(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; }
    uint32_t selectedIdx(cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.selectedIdx; }

    // Process one fan triangle (v0, self.verts[I], self.verts[I+1]) at the cascade level.
    // I is a template constant so self.verts[I] / self.verts[I+1] / triangleSolidAngles[I-1]
    // / triangleIndices[I-1] are all literal-index accesses; the body's
    // append-to-slot-(I-1) only works because we treat degenerate triangles as
    // zero-weight rather than skipping them. This is a behavior change from the
    // old `count++ on non-degenerate` form: degenerate triangles now occupy a
    // slot with zero weight, which contributes nothing to the CDF and has
    // selection probability 0, so the sampling result is unchanged.
    template<uint32_t I>
    static void processFanTri(float32_t3 v0, float32_t3 faceNormal, NBL_REF_ARG(TriangleFanSampler) self)
    {
        const float32_t3 v1 = self.verts[I];
        const float32_t3 v2 = self.verts[I + 1];

        const float32_t3 origin = float32_t3(0, 0, 0);
        const float32_t3 triVerts[3] = {v0, v1, v2};
        shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(triVerts, origin);

        // Compute solid angle (or projected) and clamp to >= 0; degenerate
        // triangles end up with zero weight and don't affect sampling.
        float32_t sa = Projected ? shapeTri.projectedSolidAngle(faceNormal) : shapeTri.solid_angle;
        sa = max(sa, 0.0f);

        self.triangleSolidAngles[I - 1u] = sa;
        self.triangleIndices[I - 1u] = I;
        self.totalWeight += sa;
    }
};

#endif // _SOLID_ANGLE_VIS_EXAMPLE_TRIANGLE_SAMPLING_HLSL_INCLUDED_
diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl
new file mode 100644
index 000000000..5100b2fc0
--- /dev/null
+++ b/73_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl
@@ -0,0 +1,31 @@
//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O.
//// This file is part of the "Nabla Engine".
//// For conditions of distribution and use, see copyright notice in nabla.h
#ifndef _SOLID_ANGLE_VIS_EXAMPLE_UTILS_HLSL_INCLUDED_
#define _SOLID_ANGLE_VIS_EXAMPLE_UTILS_HLSL_INCLUDED_
#include
#include
#include

// unused
// Packs a silhouette description into one 32-bit word.
//   s[0]    - vertex count (only the low 3 bits are kept)
//   s[1..6] - vertex indices (only the low 3 bits of each are kept)
// Layout of the result: vertex k (1-based) occupies bits [3*(k-1), 3*(k-1)+2],
// i.e. bits 0..17 in total, and the count occupies bits 29..31.
uint32_t packSilhouette(const uint32_t s[7])
{
    uint32_t packed = 0;
    const uint32_t size = s[0] & 0x7; // 3 bits for size

    // Pack vertices LSB-first (vertex 1 in the lowest 3 bits). The values are
    // unsigned, so the former `v < 0` clamp could never trigger and was removed.
    for (uint32_t i = 1; i <= 6; ++i)
        packed |= (s[i] & 0x7) << (3 * (i - 1));

    // Put size in the MSB (bits 29-31 of the 32-bit word, leaving 29 bits for vertices)
    packed |= size << 29;

    return packed;
}

#endif // _SOLID_ANGLE_VIS_EXAMPLE_UTILS_HLSL_INCLUDED_
diff --git a/73_SolidAngleVisualizer/config.json.template b/73_SolidAngleVisualizer/config.json.template
new file mode 100644
index 000000000..f961745c1
--- /dev/null
+++ b/73_SolidAngleVisualizer/config.json.template
@@ -0,0 +1,28 @@
{
	"enableParallelBuild": true,
	"threadsPerBuildProcess" : 2,
	"isExecuted": false,
	"scriptPath": "",
	"cmake": {
		"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
		"buildModes": [],
		"requiredOptions": []
	},
	"profiles": [
		{
			"backend": "vulkan",
			"platform":
"windows", + "buildModes": [], + "runConfiguration": "Release", + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/73_SolidAngleVisualizer/include/common.hpp b/73_SolidAngleVisualizer/include/common.hpp new file mode 100644 index 000000000..fe7d086dd --- /dev/null +++ b/73_SolidAngleVisualizer/include/common.hpp @@ -0,0 +1,19 @@ +#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ +#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ + + +#include "nbl/examples/examples.hpp" + +// the example's headers +#include "transform.hpp" + +using namespace nbl; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; + +#endif // _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ \ No newline at end of file diff --git a/73_SolidAngleVisualizer/include/transform.hpp b/73_SolidAngleVisualizer/include/transform.hpp new file mode 100644 index 000000000..ecacae17d --- /dev/null +++ b/73_SolidAngleVisualizer/include/transform.hpp @@ -0,0 +1,213 @@ +#ifndef _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_ +#define _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_ + +#include "nbl/ui/ICursorControl.h" +#include "nbl/ext/ImGui/ImGui.h" +#include "imgui/imgui_internal.h" +#include "imguizmo/ImGuizmo.h" + +struct TransformRequestParams +{ + uint8_t sceneTexDescIx = ~0; + bool useWindow = true, editTransformDecomposition = false, enableViewManipulate = true; +}; + +struct TransformReturnInfo +{ + nbl::hlsl::uint16_t2 sceneResolution = { 1, 1 }; + bool allowCameraMovement = false; +}; + +TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjection, float* matrix, const TransformRequestParams& params) +{ + static ImGuizmo::OPERATION mCurrentGizmoOperation(ImGuizmo::TRANSLATE); + static ImGuizmo::MODE mCurrentGizmoMode(ImGuizmo::LOCAL); + static 
bool useSnap = false; + static float snap[3] = { 1.f, 1.f, 1.f }; + static float bounds[] = { 0.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f }; + static float boundsSnap[] = { 0.1f, 0.1f, 0.1f }; + static bool boundSizing = false; + static bool boundSizingSnap = false; + + ImGui::Text("Use gizmo (T/R/G) or ViewManipulate widget to transform the cube"); + + if (params.editTransformDecomposition) + { + if (ImGui::IsKeyPressed(ImGuiKey_T)) + mCurrentGizmoOperation = ImGuizmo::TRANSLATE; + if (ImGui::IsKeyPressed(ImGuiKey_R)) + mCurrentGizmoOperation = ImGuizmo::ROTATE; + if (ImGui::IsKeyPressed(ImGuiKey_G)) + mCurrentGizmoOperation = ImGuizmo::SCALE; + if (ImGui::RadioButton("Translate", mCurrentGizmoOperation == ImGuizmo::TRANSLATE)) + mCurrentGizmoOperation = ImGuizmo::TRANSLATE; + ImGui::SameLine(); + if (ImGui::RadioButton("Rotate", mCurrentGizmoOperation == ImGuizmo::ROTATE)) + mCurrentGizmoOperation = ImGuizmo::ROTATE; + ImGui::SameLine(); + if (ImGui::RadioButton("Scale", mCurrentGizmoOperation == ImGuizmo::SCALE)) + mCurrentGizmoOperation = ImGuizmo::SCALE; + if (ImGui::RadioButton("Universal", mCurrentGizmoOperation == ImGuizmo::UNIVERSAL)) + mCurrentGizmoOperation = ImGuizmo::UNIVERSAL; + + // For UI editing, decompose temporarily + float matrixTranslation[3], matrixRotation[3], matrixScale[3]; + ImGuizmo::DecomposeMatrixToComponents(matrix, matrixTranslation, matrixRotation, matrixScale); + ImGui::DragFloat3("Tr", matrixTranslation, 0.01f); + ImGui::DragFloat3("Rt", matrixRotation, 0.01f); + ImGui::DragFloat3("Sc", matrixScale, 0.01f); + ImGuizmo::RecomposeMatrixFromComponents(matrixTranslation, matrixRotation, matrixScale, matrix); + + if (mCurrentGizmoOperation != ImGuizmo::SCALE) + { + if (ImGui::RadioButton("Local", mCurrentGizmoMode == ImGuizmo::LOCAL)) + mCurrentGizmoMode = ImGuizmo::LOCAL; + ImGui::SameLine(); + if (ImGui::RadioButton("World", mCurrentGizmoMode == ImGuizmo::WORLD)) + mCurrentGizmoMode = ImGuizmo::WORLD; + } + if (ImGui::IsKeyPressed(ImGuiKey_S) 
&& ImGui::IsKeyPressed(ImGuiKey_LeftShift)) + useSnap = !useSnap; + ImGui::Checkbox("##UseSnap", &useSnap); + ImGui::SameLine(); + + switch (mCurrentGizmoOperation) + { + case ImGuizmo::TRANSLATE: + ImGui::InputFloat3("Snap", &snap[0]); + break; + case ImGuizmo::ROTATE: + ImGui::InputFloat("Angle Snap", &snap[0]); + break; + case ImGuizmo::SCALE: + ImGui::InputFloat("Scale Snap", &snap[0]); + break; + } + ImGui::Checkbox("Bound Sizing", &boundSizing); + if (boundSizing) + { + ImGui::PushID(3); + ImGui::Checkbox("##BoundSizing", &boundSizingSnap); + ImGui::SameLine(); + ImGui::InputFloat3("Snap", boundsSnap); + ImGui::PopID(); + } + } + + ImGuiIO& io = ImGui::GetIO(); + float viewManipulateRight = io.DisplaySize.x; + float viewManipulateTop = 0; + bool isWindowHovered = false; + static ImGuiWindowFlags gizmoWindowFlags = 0; + + /* + for the "useWindow" case we just render to a gui area, + otherwise to fake full screen transparent window + + note that for both cases we make sure gizmo being + rendered is aligned to our texture scene using + imgui "cursor" screen positions + */ + // TODO: this shouldn't be handled here I think + SImResourceInfo info; + info.textureID = params.sceneTexDescIx; + info.samplerIx = (uint16_t)nbl::ext::imgui::UI::DefaultSamplerIx::USER; + + TransformReturnInfo retval; + if (params.useWindow) + { + ImGui::SetNextWindowSize(ImVec2(800, 800), ImGuiCond_Appearing); + ImGui::SetNextWindowPos(ImVec2(400, 20), ImGuiCond_Appearing); + ImGui::PushStyleColor(ImGuiCol_WindowBg, (ImVec4)ImColor(0.35f, 0.3f, 0.3f)); + ImGui::Begin("Gizmo", 0, gizmoWindowFlags); + ImGuizmo::SetDrawlist(); + + ImVec2 contentRegionSize = ImGui::GetContentRegionAvail(); + ImVec2 windowPos = ImGui::GetWindowPos(); + ImVec2 cursorPos = ImGui::GetCursorScreenPos(); + isWindowHovered = ImGui::IsWindowHovered(); + + ImGui::Image(info, contentRegionSize); + ImGuizmo::SetRect(cursorPos.x, cursorPos.y, contentRegionSize.x, contentRegionSize.y); + retval.sceneResolution = { 
contentRegionSize.x,contentRegionSize.y }; + + viewManipulateRight = cursorPos.x + contentRegionSize.x; + viewManipulateTop = cursorPos.y; + + ImGuiWindow* window = ImGui::GetCurrentWindow(); + gizmoWindowFlags = (isWindowHovered && ImGui::IsMouseHoveringRect(window->InnerRect.Min, window->InnerRect.Max) ? ImGuiWindowFlags_NoMove : 0); + } + else + { + ImGui::SetNextWindowPos(ImVec2(0, 0)); + ImGui::SetNextWindowSize(io.DisplaySize); + ImGui::PushStyleColor(ImGuiCol_WindowBg, ImVec4(0, 0, 0, 0)); // fully transparent fake window + ImGui::Begin("FullScreenWindow", nullptr, ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoScrollbar | ImGuiWindowFlags_NoScrollWithMouse | ImGuiWindowFlags_NoCollapse | ImGuiWindowFlags_NoBringToFrontOnFocus | ImGuiWindowFlags_NoBackground | ImGuiWindowFlags_NoInputs); + + ImVec2 contentRegionSize = ImGui::GetContentRegionAvail(); + ImVec2 cursorPos = ImGui::GetCursorScreenPos(); + isWindowHovered = ImGui::IsWindowHovered(); + + ImGui::Image(info, contentRegionSize); + ImGuizmo::SetRect(cursorPos.x, cursorPos.y, contentRegionSize.x, contentRegionSize.y); + retval.sceneResolution = { contentRegionSize.x,contentRegionSize.y }; + + viewManipulateRight = cursorPos.x + contentRegionSize.x; + viewManipulateTop = cursorPos.y; + } + + // Standard Manipulate gizmo - let ImGuizmo modify the matrix directly + ImGuizmo::Manipulate(cameraView, cameraProjection, mCurrentGizmoOperation, mCurrentGizmoMode, matrix, NULL, useSnap ? &snap[0] : NULL, boundSizing ? bounds : NULL, boundSizingSnap ? 
boundsSnap : NULL); + + retval.allowCameraMovement = isWindowHovered && !ImGuizmo::IsUsing(); + + // ViewManipulate for rotating the view + if (params.enableViewManipulate) + { + // Store original translation and scale before ViewManipulate + // Decompose original matrix + nbl::hlsl::float32_t3 translation, rotation, scale; + ImGuizmo::DecomposeMatrixToComponents(matrix, &translation.x, &rotation.x, &scale.x); + // Create rotation-only matrix + nbl::hlsl::float32_t4x4 temp; + nbl::hlsl::float32_t3 baseTranslation(0.0f); + nbl::hlsl::float32_t3 baseScale(1.0f); + ImGuizmo::RecomposeMatrixFromComponents(&baseTranslation.x, &rotation.x, &baseScale.x, &temp[0][0]); + temp = nbl::hlsl::transpose(temp); + + // Invert to make it "view-like" + nbl::hlsl::float32_t4x4 tempInv = nbl::hlsl::inverse(temp); + + // Create flip matrix (flip X to fix left/right) + nbl::hlsl::float32_t4x4 flip(1.0f); + flip[0][0] = -1.0f; // Flip X axis + + // Apply flip to the inverted matrix + tempInv = nbl::hlsl::mul(nbl::hlsl::mul(flip, tempInv), flip); + + // Manipulate + ImGuizmo::ViewManipulate(&tempInv[0][0], 1.0f, ImVec2(viewManipulateRight - 128, viewManipulateTop), ImVec2(128, 128), 0x10101010); + + // Undo flip (flip is its own inverse, so multiply by flip again) + tempInv = nbl::hlsl::mul(nbl::hlsl::mul(flip, tempInv), flip); + + // Invert back to model space + temp = nbl::hlsl::inverse(tempInv); + temp = nbl::hlsl::transpose(temp); + + // Extract rotation + nbl::hlsl::float32_t3 newRot; + ImGuizmo::DecomposeMatrixToComponents(&temp[0][0], &baseTranslation.x, &newRot.x, &baseScale.x); + // Recompose original matrix with new rotation but keep translation & scale + ImGuizmo::RecomposeMatrixFromComponents(&translation.x, &newRot.x, &scale.x, matrix); + + retval.allowCameraMovement &= isWindowHovered && !ImGuizmo::IsUsingViewManipulate(); + } + + ImGui::End(); + ImGui::PopStyleColor(); + + return retval; +} + +#endif // _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_ \ No newline at end of file 
diff --git a/73_SolidAngleVisualizer/main.cpp b/73_SolidAngleVisualizer/main.cpp new file mode 100644 index 000000000..680f5b460 --- /dev/null +++ b/73_SolidAngleVisualizer/main.cpp @@ -0,0 +1,2034 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#include "nbl/this_example/builtin/build/spirv/keys.hpp" + +#include "app_resources/hlsl/benchmark/common.hlsl" +#include "app_resources/hlsl/common.hlsl" +#include "common.hpp" +#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" +#include +#include + +//#include "app_resources/hlsl/silhouette.hlsl" +//#include "app_resources/hlsl/parallelogram_sampling.hlsl" +//#include "app_resources/hlsl/pyramid_sampling.hlsl" +//#include "app_resources/hlsl/triangle_sampling.hlsl" +//#include + +// ============================================================================ +// Compile-time concept verification (mirrors example 37 main.cpp). Each +// example sampler must satisfy TractableSampler: +// typedef domain_type, codomain_type, density_type, cache_type +// codomain_type generate(domain_type, ref cache_type) +// density_type forwardPdf(domain_type, cache_type) +// SphericalPyramid is checked across all four (UseCaliper, InnerSampler) +// pairs that the frag shader / benchmark actually instantiate. 
+// ============================================================================ + +//static_assert(nbl::hlsl::sampling::concepts::TractableSampler); +//static_assert(nbl::hlsl::sampling::concepts::TractableSampler>); +//static_assert(nbl::hlsl::sampling::concepts::TractableSampler>); +//static_assert(nbl::hlsl::sampling::concepts::TractableSampler); +//static_assert(nbl::hlsl::sampling::concepts::TractableSampler>>); +//static_assert(nbl::hlsl::sampling::concepts::TractableSampler>>); +//static_assert(nbl::hlsl::sampling::concepts::TractableSampler>>); +//static_assert(nbl::hlsl::sampling::concepts::TractableSampler>); + +// App execution mode -- pick at compile time via -DAPP_MODE=N +// APP_MODE_VISUALIZER (1) full visualization with debug + ImGui editor (default) +// APP_MODE_NSIGHT_BENCHMARKS(2) submits one dispatch per SAMPLING_MODE_FLAGS in a single capture, then exits +#define APP_MODE_VISUALIZER 1 +#define APP_MODE_NSIGHT_BENCHMARKS 2 +#ifndef APP_MODE +#define APP_MODE APP_MODE_VISUALIZER +#endif + +/* +Renders the scene to an offscreen framebuffer whose color attachment is then sampled into an ImGui window. + +Written with Nabla's UI extension and integrated with ImGuizmo to manipulate the scene object's transform. 
+*/ +class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinResourcesApplication +{ + using device_base_t = MonoWindowApplication; + using asset_base_t = BuiltinResourcesApplication; + + public: + inline SolidAngleVisualizer(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) + : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD), + device_base_t({2048, 1024}, EF_UNKNOWN, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) + { + } + + virtual SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override + { + auto retval = device_base_t::getPreferredDeviceFeatures(); + retval.pipelineExecutableInfo = true; + return retval; + } + + inline bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + interface.m_visualizer = this; + + m_semaphore = m_device->createSemaphore(m_realFrameIx); + if (!m_semaphore) + return logFail("Failed to Create a Semaphore!"); + + auto pool = m_device->createCommandPool(getGraphicsQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + for (auto i = 0u; i < MaxFramesInFlight; i++) + { + if (!pool) + return logFail("Couldn't create Command Pool!"); + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, {m_cmdBufs.data() + i, 1})) + return logFail("Couldn't create Command Buffer!"); + } + +#if APP_MODE == APP_MODE_VISUALIZER + const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()}; + m_scene = CGeometryCreatorScene::create( + {.transferQueue = getTransferUpQueue(), + .utilities = m_utils.get(), + .logger = m_logger.get(), + .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies}, + 
CSimpleDebugRenderer::DefaultPolygonGeometryPatch); +#endif + + // for the scene drawing pass + { + IGPURenderpass::SCreationParams params = {}; + const IGPURenderpass::SCreationParams::SDepthStencilAttachmentDescription depthAttachments[] = { + {{{.format = sceneRenderDepthFormat, + .samples = IGPUImage::ESCF_1_BIT, + .mayAlias = false}, + /*.loadOp =*/ {IGPURenderpass::LOAD_OP::CLEAR}, + /*.storeOp =*/ {IGPURenderpass::STORE_OP::STORE}, + /*.initialLayout =*/ {IGPUImage::LAYOUT::UNDEFINED}, + /*.finalLayout =*/ {IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}}, + IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd}; + params.depthStencilAttachments = depthAttachments; + const IGPURenderpass::SCreationParams::SColorAttachmentDescription colorAttachments[] = { + {{ + {.format = finalSceneRenderFormat, + .samples = IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT, + .mayAlias = false}, + /*.loadOp =*/IGPURenderpass::LOAD_OP::CLEAR, + /*.storeOp =*/IGPURenderpass::STORE_OP::STORE, + /*.initialLayout =*/IGPUImage::LAYOUT::UNDEFINED, + /*.finalLayout =*/IGPUImage::LAYOUT::READ_ONLY_OPTIMAL // ImGUI shall read + }}, + IGPURenderpass::SCreationParams::ColorAttachmentsEnd}; + params.colorAttachments = colorAttachments; + IGPURenderpass::SCreationParams::SSubpassDescription subpasses[] = { + {}, + IGPURenderpass::SCreationParams::SubpassesEnd}; + subpasses[0].depthStencilAttachment = {{.render = {.attachmentIndex = 0, .layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}}; + subpasses[0].colorAttachments[0] = {.render = {.attachmentIndex = 0, .layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}; + params.subpasses = subpasses; + + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { + // wipe-transition of Color to ATTACHMENT_OPTIMAL and depth + { + .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = { + // last place where the depth can get modified in previous frame, 
`COLOR_ATTACHMENT_OUTPUT_BIT` is implicitly later + // while color is sampled by ImGUI + .srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, + // don't want any writes to be available, as we are clearing both attachments + .srcAccessMask = ACCESS_FLAGS::NONE, + // destination needs to wait as early as possible + // TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically later stage, see TODO in `ECommonEnums.h` + .dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // because depth and color get cleared first no read mask + .dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT} + // leave view offsets and flags default + }, + { + .srcSubpass = 0, .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, .memoryBarrier = {// last place where the color can get modified, depth is implicitly earlier + .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // only write ops, reads can't be made available, also won't be using depth so don't care about it being visible to anyone else + .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT, + // the ImGUI will sample the color, then next frame we overwrite both attachments + .dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT | PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT, + // but we only care about the availability-visibility chain between renderpass and imgui + .dstAccessMask = ACCESS_FLAGS::SAMPLED_READ_BIT} + // leave view offsets and flags default + }, + IGPURenderpass::SCreationParams::DependenciesEnd}; + params.dependencies = dependencies; + auto solidAngleRenderpassParams = params; + m_mainRenderpass = m_device->createRenderpass(std::move(params)); + if (!m_mainRenderpass) + return logFail("Failed to create Main Renderpass!"); + + m_solidAngleRenderpass = 
m_device->createRenderpass(std::move(solidAngleRenderpassParams)); + if (!m_solidAngleRenderpass) + return logFail("Failed to create Solid Angle Renderpass!"); + } + +#if APP_MODE == APP_MODE_VISUALIZER + const auto& geometries = m_scene->getInitParams().geometries; + m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(), m_solidAngleRenderpass.get(), 0, {&geometries.front().get(), geometries.size()}); + // special case + { + const auto& pipelines = m_renderer->getInitParams().pipelines; + auto ix = 0u; + for (const auto& name : m_scene->getInitParams().geometryNames) + { + if (name == "Cone") + m_renderer->getGeometry(ix).pipeline = pipelines[CSimpleDebugRenderer::SInitParams::PipelineType::Cone]; + ix++; + } + } + // we'll only display one thing at a time + m_renderer->m_instances.resize(1); +#endif + + // Create graphics pipeline + { + auto loadPrecompiledShader = [&](auto key) -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + auto assetBundle = m_assetMgr->getAsset(key.data(), lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + m_logger->log("Could not load precompiled shader!", ILogger::ELL_ERROR); + std::exit(-1); + } + assert(assets.size() == 1); + auto shader = IAsset::castDown(assets[0]); + if (!shader) + { + m_logger->log("Failed to load precompiled shader!", ILogger::ELL_ERROR); + std::exit(-1); + } + return shader; + }; + + ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get()); + if (!fsTriProtoPPln) + return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); + + smart_refctd_ptr saVisShaders[SAMPLING_MODE_FLAGS::Count * DebugPermutations]; + + auto addSaVis = [&](SAMPLING_MODE_FLAGS mode) + { + saVisShaders[denseIdOf(mode) * DebugPermutations + 0] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key(m_device.get())); + 
saVisShaders[denseIdOf(mode) * DebugPermutations + 1] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key(m_device.get())); + }; + + addSaVis.template operator()<"sa_vis_tri_sa", "sa_vis_tri_sa_dbg">(SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE); + addSaVis.template operator()<"sa_vis_tri_psa", "sa_vis_tri_psa_dbg">(SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE); + addSaVis.template operator()<"sa_vis_para", "sa_vis_para_dbg">(SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE); + addSaVis.template operator()<"sa_vis_rectangle", "sa_vis_rectangle_dbg">(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID); + addSaVis.template operator()<"sa_vis_bilinear", "sa_vis_bilinear_dbg">(SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID); + addSaVis.template operator()<"sa_vis_proj_rectangle", "sa_vis_proj_rectangle_dbg">(SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID); + addSaVis.template operator()<"sa_vis_silhouette", "sa_vis_silhouette_dbg">(SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY); + addSaVis.template operator()<"sa_vis_pyramid", "sa_vis_pyramid_dbg">(SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY); + addSaVis.template operator()<"sa_vis_caliper_pyramid", "sa_vis_caliper_pyramid_dbg">(SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY); + addSaVis.template operator()<"sa_vis_caliper_rectangle", "sa_vis_caliper_rectangle_dbg">(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID); + addSaVis.template operator()<"sa_vis_obb_face", "sa_vis_obb_face_dbg">(SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT); + + smart_refctd_ptr rayVisShaders[DebugPermutations]; + rayVisShaders[0] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"ray_vis">(m_device.get())); + rayVisShaders[1] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"ray_vis_dbg">(m_device.get())); + + smart_refctd_ptr solidAngleVisLayout, rayVisLayout; + nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = + { + {.binding = 0, + .type = 
nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = ShaderStage::ESS_FRAGMENT, + .count = 1}}; + smart_refctd_ptr dsLayout = m_device->createDescriptorSetLayout(bindings); + + const asset::SPushConstantRange saRanges[] = {{.stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, .offset = 0, .size = sizeof(PushConstants)}}; + const asset::SPushConstantRange rayRanges[] = {{.stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, .offset = 0, .size = sizeof(PushConstantRayVis)}}; + + if (!dsLayout) + return logFail("Failed to create a Descriptor Layout!\n"); + + solidAngleVisLayout = m_device->createPipelineLayout(saRanges, dsLayout); + + rayVisLayout = m_device->createPipelineLayout(rayRanges, dsLayout); + + { + // Create all SolidAngleVis pipeline variants + for (uint32_t i = 0; i < SAMPLING_MODE_FLAGS::Count * DebugPermutations; i++) + { + const IGPUPipelineBase::SShaderSpecInfo fragSpec = { + .shader = saVisShaders[i].get(), + .entryPoint = "main"}; + m_solidAngleVisPipelines[i] = fsTriProtoPPln.createPipeline(fragSpec, solidAngleVisLayout.get(), m_solidAngleRenderpass.get()); + if (!m_solidAngleVisPipelines[i]) + return logFail("Could not create SolidAngleVis Graphics Pipeline variant %d!", i); + } + + asset::SRasterizationParams rasterParams = ext::FullScreenTriangle::ProtoPipeline::DefaultRasterParams; + rasterParams.depthWriteEnable = true; + rasterParams.depthCompareOp = asset::E_COMPARE_OP::ECO_GREATER; + + // Create all RayVis pipeline variants + for (uint32_t i = 0; i < DebugPermutations; i++) + { + const IGPUPipelineBase::SShaderSpecInfo fragSpec = { + .shader = rayVisShaders[i].get(), + .entryPoint = "main"}; + m_rayVisPipelines[i] = fsTriProtoPPln.createPipeline(fragSpec, rayVisLayout.get(), m_mainRenderpass.get(), 0, {}, rasterParams); + if (!m_rayVisPipelines[i]) + return logFail("Could not create RayVis Graphics Pipeline variant %d!", i); + } + } + // Allocate the host-visible ResultData buffer; every failure below aborts initialization instead of falling through to a null dereference + { +
constexpr size_t BufferSize = sizeof(ResultData); + + nbl::video::IGPUBuffer::SCreationParams params = {}; + params.size = BufferSize; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; + m_outputStorageBuffer = m_device->createBuffer(std::move(params)); + if (!m_outputStorageBuffer) + return logFail("Failed to create a GPU Buffer of size %zu!\n", BufferSize); + + m_outputStorageBuffer->setObjectDebugName("ResultData output buffer"); + + nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = m_outputStorageBuffer->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); + + m_allocation = m_device->allocate(reqs, m_outputStorageBuffer.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); + if (!m_allocation.isValid()) + return logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + + assert(m_outputStorageBuffer->getBoundMemory().memory == m_allocation.memory.get()); + smart_refctd_ptr pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, {&dsLayout.get(), 1}); + + m_ds = pool->createDescriptorSet(std::move(dsLayout)); + { + IGPUDescriptorSet::SDescriptorInfo info[1]; + info[0].desc = smart_refctd_ptr(m_outputStorageBuffer); + info[0].info.buffer = {.offset = 0, .size = BufferSize}; + IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { + {.dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = info}}; + m_device->updateDescriptorSets(writes, {}); + } + } + + if (!m_allocation.memory->map({0ull, m_allocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ)) + return logFail("Failed to map the Device Memory!\n"); + + // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches + const ILogicalDevice::MappedMemoryRange memoryRange(m_allocation.memory.get(), 0ull, m_allocation.memory->getAllocationSize()); + if 
(!m_allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memoryRange); + } + +#if APP_MODE == APP_MODE_VISUALIZER + // Create ImGUI + { + auto scRes = static_cast(m_surface->getSwapchainResources()); + ext::imgui::UI::SCreationParameters params = {}; + params.resources.texturesInfo = {.setIx = 0u, .bindingIx = TexturesImGUIBindingIndex}; + params.resources.samplersInfo = {.setIx = 0u, .bindingIx = 1u}; + params.utilities = m_utils; + params.transfer = getTransferUpQueue(); + params.pipelineLayout = ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxImGUITextures); + params.assetManager = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); + params.renderpass = smart_refctd_ptr(scRes->getRenderpass()); + params.subpassIx = 0u; + params.pipelineCache = nullptr; + interface.imGUI = ext::imgui::UI::create(std::move(params)); + if (!interface.imGUI) + return logFail("Failed to create `nbl::ext::imgui::UI` class"); + } + + // create rest of User Interface + { + auto* imgui = interface.imGUI.get(); + // create the suballocated descriptor set + { + // note that we use default layout provided by our extension, but you are free to create your own by filling ext::imgui::UI::S_CREATION_PARAMETERS::resources + const auto* layout = interface.imGUI->getPipeline()->getLayout()->getDescriptorSetLayout(0u); + auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT, {&layout, 1}); + auto ds = pool->createDescriptorSet(smart_refctd_ptr(layout)); + interface.subAllocDS = make_smart_refctd_ptr(std::move(ds)); + if (!interface.subAllocDS) + return logFail("Failed to create the descriptor set"); + // make sure Texture Atlas slot is taken for eternity + { + auto dummy = SubAllocatedDescriptorSet::invalid_value; + interface.subAllocDS->multi_allocate(0, 1, 
&dummy); + assert(dummy == ext::imgui::UI::FontAtlasTexId); + } + // write constant descriptors, note we don't create info & write pair for the samplers because UI extension's are immutable and baked into DS layout + IGPUDescriptorSet::SDescriptorInfo info = {}; + info.desc = smart_refctd_ptr(interface.imGUI->getFontAtlasView()); + info.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + const IGPUDescriptorSet::SWriteDescriptorSet write = { + .dstSet = interface.subAllocDS->getDescriptorSet(), + .binding = TexturesImGUIBindingIndex, + .arrayElement = ext::imgui::UI::FontAtlasTexId, + .count = 1, + .info = &info}; + if (!m_device->updateDescriptorSets({&write, 1}, {})) + return logFail("Failed to write the descriptor set"); + } + imgui->registerListener([this]() + { interface(); }); + } + + interface.camera.mapKeysToWASD(); +#endif + +#if APP_MODE == APP_MODE_NSIGHT_BENCHMARKS + // The actual one-shot runs from inside the first renderFrame() so NSight's Shader Profiler has + // the same render-loop context as the working UI-button-triggered benchmark. Just seed the OBB + // matrix here from the default TRS so the bench shaders see sane inputs. 
+ ImGuizmo::RecomposeMatrixFromComponents(&interface.m_TRS.translation.x, &interface.m_TRS.rotation.x, &interface.m_TRS.scale.x, &interface.m_OBBModelMatrix[0][0]); +#endif + onAppInitializedFinish(); + return true; + } + + virtual inline bool keepRunning() override + { + if (!m_keepRunning) + return false; + return device_base_t::keepRunning(); + } + + // Visualizer mode: hand the font-atlas descriptor slot back to the sub-allocated set, then defer to the base-class teardown + virtual inline bool onAppTerminated() override + { +#if APP_MODE == APP_MODE_VISUALIZER + SubAllocatedDescriptorSet::value_type fontAtlasDescIx = ext::imgui::UI::FontAtlasTexId; + IGPUDescriptorSet::SDropDescriptorSet dummy[1]; + interface.subAllocDS->multi_deallocate(dummy, TexturesImGUIBindingIndex, 1, &fontAtlasDescIx); +#endif + return device_base_t::onAppTerminated(); + } + + inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override + { +#if APP_MODE == APP_MODE_NSIGHT_BENCHMARKS + // Minimal frame: run the one-shot once (inside the render loop so NSight's Shader Profiler + // has the same context as the UI-triggered benchmark), then submit a bare swapchain clear + // to satisfy the framework's frame contract, and signal exit on the next loop iteration. 
+ if (!m_nsightBenchDone) + { + SamplingBenchmark(*this).runNSightOneShot(); + m_nsightBenchDone = true; + m_keepRunning = false; + } + + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + auto* const cb = m_cmdBufs.data()[resourceIx].get(); + cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + { + auto* scRes = static_cast(m_surface->getSwapchainResources()); + const IGPUCommandBuffer::SClearColorValue clearValue = {.float32 = {0.f, 0.f, 0.f, 1.f}}; + const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = + {.framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex), + .colorClearValues = &clearValue, + .depthStencilClearValues = nullptr, + .renderArea = {.offset = {0, 0}, .extent = {m_window->getWidth(), m_window->getHeight()}}}; + beginRenderpass(cb, renderpassInfo); + cb->endRenderPass(); + } + cb->end(); + + IQueue::SSubmitInfo::SSemaphoreInfo retval = + {.semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS}; + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = {{.cmdbuf = cb }}; + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { + {.semaphore = device_base_t::getCurrentAcquire().semaphore, + .value = device_base_t::getCurrentAcquire().acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE }}; + const IQueue::SSubmitInfo infos[] = { + {.waitSemaphores = acquired, .commandBuffers = commandBuffers, .signalSemaphores = {&retval, 1}}}; + if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS) + { + retval.semaphore = nullptr; + m_realFrameIx--; + } + return retval; +#else + // CPU events + update(nextPresentationTimestamp); + + { + const auto& virtualSolidAngleWindowRes = interface.solidAngleViewTransformReturnInfo.sceneResolution; + const auto& virtualMainWindowRes = interface.mainViewTransformReturnInfo.sceneResolution; + if 
(!m_solidAngleViewFramebuffer || m_solidAngleViewFramebuffer->getCreationParameters().width != virtualSolidAngleWindowRes[0] || m_solidAngleViewFramebuffer->getCreationParameters().height != virtualSolidAngleWindowRes[1] || + !m_mainViewFramebuffer || m_mainViewFramebuffer->getCreationParameters().width != virtualMainWindowRes[0] || m_mainViewFramebuffer->getCreationParameters().height != virtualMainWindowRes[1]) + recreateFramebuffers(); + } + + // + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + + auto* const cb = m_cmdBufs.data()[resourceIx].get(); + cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + if (m_solidAngleViewFramebuffer) + { + asset::SBufferRange range { + .offset = 0, + .size = m_outputStorageBuffer->getSize(), + .buffer = m_outputStorageBuffer}; + cb->fillBuffer(range, 0u); + { + const auto& creationParams = m_solidAngleViewFramebuffer->getCreationParameters(); + cb->beginDebugMarker("Draw Circle View Frame"); + { + const IGPUCommandBuffer::SClearDepthStencilValue farValue = {.depth = 0.f}; + const IGPUCommandBuffer::SClearColorValue clearValue = {.float32 = {0.f, 0.f, 0.f, 1.f}}; + const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = + { + .framebuffer = m_solidAngleViewFramebuffer.get(), + .colorClearValues = &clearValue, + .depthStencilClearValues = &farValue, + .renderArea = { + .offset = {0, 0}, + .extent = {creationParams.width, creationParams.height}}}; + beginRenderpass(cb, renderpassInfo); + } + // draw scene + { + static uint32_t lastFrameSeed = 0u; + lastFrameSeed = m_frameSeeding ? 
static_cast(m_realFrameIx) : lastFrameSeed; + PushConstants pc { + .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), + .viewport = {0.f, 0.f, static_cast(creationParams.width), static_cast(creationParams.height)}, + .shadingPoint = interface.m_ShadingPoint, + .sampleCount = static_cast(m_SampleCount), + .frameIndex = lastFrameSeed}; + const uint32_t debugIdx = m_debugVisualization ? 1u : 0u; + auto pipeline = m_solidAngleVisPipelines[denseIdOf(m_samplingMode) * DebugPermutations + debugIdx]; + cb->bindGraphicsPipeline(pipeline.get()); + cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(pc), &pc); + cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 0, 1, &m_ds.get()); + ext::FullScreenTriangle::recordDrawCall(cb); + } + cb->endRenderPass(); + cb->endDebugMarker(); + } + + if (m_debugVisualization) + { + m_device->waitIdle(); + std::memcpy(&m_GPUOutResulData, static_cast(m_allocation.memory->getMappedPointer()), sizeof(ResultData)); + m_device->waitIdle(); + } + } + // draw main view + if (m_mainViewFramebuffer) + { + { + auto creationParams = m_mainViewFramebuffer->getCreationParameters(); + const IGPUCommandBuffer::SClearDepthStencilValue farValue = {.depth = 0.f}; + const IGPUCommandBuffer::SClearColorValue clearValue = {.float32 = {0.1f, 0.1f, 0.1f, 1.f}}; + const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = + { + .framebuffer = m_mainViewFramebuffer.get(), + .colorClearValues = &clearValue, + .depthStencilClearValues = &farValue, + .renderArea = { + .offset = {0, 0}, + .extent = {creationParams.width, creationParams.height}}}; + beginRenderpass(cb, renderpassInfo); + } + { // draw rays visualization + auto creationParams = m_mainViewFramebuffer->getCreationParameters(); + + cb->beginDebugMarker("Draw Rays visualization"); + // draw scene + { + float32_t4x4 viewProj = *reinterpret_cast(&interface.camera.getConcatenatedMatrix()); + float32_t3x4 view = 
*reinterpret_cast(&interface.camera.getViewMatrix()); + PushConstantRayVis pc { + .viewProjMatrix = viewProj, + .viewMatrix = view, + .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), + .invModelMatrix = hlsl::float32_t3x4(hlsl::transpose(hlsl::inverse(interface.m_OBBModelMatrix))), + .shadingPoint = interface.m_ShadingPoint, + .viewport = {0.f, 0.f, static_cast(creationParams.width), static_cast(creationParams.height)}, + .frameIndex = m_frameSeeding ? static_cast(m_realFrameIx) : 0u}; + auto pipeline = m_rayVisPipelines[m_debugVisualization ? 1u : 0u]; + cb->bindGraphicsPipeline(pipeline.get()); + cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(pc), &pc); + cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 0, 1, &m_ds.get()); + ext::FullScreenTriangle::recordDrawCall(cb); + } + cb->endDebugMarker(); + } + // draw scene + { + cb->beginDebugMarker("Main Scene Frame"); + + float32_t3x4 viewMatrix; + float32_t4x4 viewProjMatrix; + // TODO: get rid of legacy matrices + { + const auto& camera = interface.camera; + memcpy(&viewMatrix, &camera.getViewMatrix(), sizeof(viewMatrix)); + memcpy(&viewProjMatrix, &camera.getConcatenatedMatrix(), sizeof(viewProjMatrix)); + } + const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix, viewProjMatrix); + + // tear down scene every frame + auto& instance = m_renderer->m_instances[0]; + instance.world = float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)); + instance.packedGeo = m_renderer->getGeometries().data(); // cube // +interface.gcIndex; + m_renderer->render(cb, viewParams); // draw the cube/OBB + + { + // Disk visualizes the shading point; move it to interface.m_ShadingPoint. 
+ float32_t3x4 diskWorld(1.0f); + diskWorld[0][3] = interface.m_ShadingPoint.x; + diskWorld[1][3] = interface.m_ShadingPoint.y; + diskWorld[2][3] = interface.m_ShadingPoint.z; + instance.world = diskWorld; + } + instance.packedGeo = m_renderer->getGeometries().data() + 2; // disk + m_renderer->render(cb, viewParams); + } + + cb->endDebugMarker(); + cb->endRenderPass(); + } + + { + cb->beginDebugMarker("SolidAngleVisualizer IMGUI Frame"); + { + auto scRes = static_cast(m_surface->getSwapchainResources()); + const IGPUCommandBuffer::SClearColorValue clearValue = {.float32 = {0.f, 0.f, 0.f, 1.f}}; + const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = + { + .framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex), + .colorClearValues = &clearValue, + .depthStencilClearValues = nullptr, + .renderArea = { + .offset = {0, 0}, + .extent = {m_window->getWidth(), m_window->getHeight()}}}; + beginRenderpass(cb, renderpassInfo); + } + // draw ImGUI + { + auto* imgui = interface.imGUI.get(); + auto* pipeline = imgui->getPipeline(); + cb->bindGraphicsPipeline(pipeline); + // note that we use default UI pipeline layout where uiParams.resources.textures.setIx == uiParams.resources.samplers.setIx + const auto* ds = interface.subAllocDS->getDescriptorSet(); + cb->bindDescriptorSets(EPBP_GRAPHICS, pipeline->getLayout(), imgui->getCreationParameters().resources.texturesInfo.setIx, 1u, &ds); + // a timepoint in the future to release streaming resources for geometry + const ISemaphore::SWaitInfo drawFinished = {.semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u}; + if (!imgui->render(cb, drawFinished)) + { + m_logger->log("TODO: need to present acquired image before bailing because its already acquired.", ILogger::ELL_ERROR); + return {}; + } + } + cb->endRenderPass(); + cb->endDebugMarker(); + } + cb->end(); + + IQueue::SSubmitInfo::SSemaphoreInfo retval = + { + .semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = 
PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS}; + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = + { + {.cmdbuf = cb}}; + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { + {.semaphore = device_base_t::getCurrentAcquire().semaphore, + .value = device_base_t::getCurrentAcquire().acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE}}; + const IQueue::SSubmitInfo infos[] = + { + {.waitSemaphores = acquired, + .commandBuffers = commandBuffers, + .signalSemaphores = {&retval, 1}}}; + + if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS) + { + retval.semaphore = nullptr; // so that we don't wait on semaphore that will never signal + m_realFrameIx--; + } + + m_window->setCaption("[Nabla Engine] UI App Test Demo"); + return retval; +#endif + } + + protected: + const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override + { + // Subsequent submits don't wait for each other, but they wait for acquire and get waited on by present + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { + // don't want any writes to be available, we'll clear, only thing to worry about is the layout transition + { + .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, // should sync against the semaphore wait anyway + .srcAccessMask = ACCESS_FLAGS::NONE, + // layout transition needs to finish before the color write + .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT} + // leave view offsets and flags default + }, + // want layout transition to begin after all color output is done + { + .srcSubpass = 0, .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, .memoryBarrier = { + // last place where the color can get modified, depth is implicitly earlier + .srcStageMask = 
PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // only write ops, reads can't be made available + .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + // spec says nothing is needed when presentation is the destination + } + // leave view offsets and flags default + }, + IGPURenderpass::SCreationParams::DependenciesEnd}; + return dependencies; + } + + private: + inline void update(const std::chrono::microseconds nextPresentationTimestamp) + { + auto& camera = interface.camera; + camera.setMoveSpeed(interface.moveSpeed); + camera.setRotateSpeed(interface.rotateSpeed); + + m_inputSystem->getDefaultMouse(&mouse); + m_inputSystem->getDefaultKeyboard(&keyboard); + + struct + { + std::vector mouse {}; + std::vector keyboard {}; + } uiEvents; + + // TODO: should be a member really + static std::chrono::microseconds previousEventTimestamp {}; + + // I think begin/end should always be called on camera, just events shouldn't be fed, why? + // If you stop begin/end, whatever keys were up/down get their up/down values frozen leading to + // `perActionDt` becoming obnoxiously large the first time the even processing resumes due to + // `timeDiff` being computed since `lastVirtualUpTimeStamp` + camera.beginInputProcessing(nextPresentationTimestamp); + { + mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void + { + if (interface.move) + camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl + else + camera.mouseKeysUp(); + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = e.timeStamp; + uiEvents.mouse.emplace_back(e); + + //if (e.type == nbl::ui::SMouseEvent::EET_SCROLL && m_renderer) + //{ + // interface.gcIndex += int16_t(core::sign(e.scrollEvent.verticalScroll)); + // interface.gcIndex = core::clamp(interface.gcIndex, 0ull, m_renderer->getGeometries().size() - 1); + //} + } }, + m_logger.get()); + 
keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void + { + if (interface.move) + camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = e.timeStamp; + uiEvents.keyboard.emplace_back(e); + } }, + m_logger.get()); + } + camera.endInputProcessing(nextPresentationTimestamp); + + const auto cursorPosition = m_window->getCursorControl()->getPosition(); + + ext::imgui::UI::SUpdateParameters params = + { + .mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()), + .displaySize = {m_window->getWidth(), m_window->getHeight()}, + .mouseEvents = uiEvents.mouse, + .keyboardEvents = uiEvents.keyboard}; + + // interface.objectName = m_scene->getInitParams().geometryNames[interface.gcIndex]; + interface.imGUI->update(params); + } + + void recreateFramebuffers() + { + auto createImageAndView = [&](const uint16_t2 resolution, E_FORMAT format) -> smart_refctd_ptr + { + auto image = m_device->createImage({{.type = IGPUImage::ET_2D, + .samples = IGPUImage::ESCF_1_BIT, + .format = format, + .extent = {resolution.x, resolution.y, 1}, + .mipLevels = 1, + .arrayLayers = 1, + .usage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT | IGPUImage::EUF_SAMPLED_BIT}}); + if (!m_device->allocate(image->getMemoryReqs(), image.get()).isValid()) + return nullptr; + IGPUImageView::SCreationParams params = { + .image = std::move(image), + .viewType = IGPUImageView::ET_2D, + .format = format}; + params.subresourceRange.aspectMask = isDepthOrStencilFormat(format) ? 
IGPUImage::EAF_DEPTH_BIT : IGPUImage::EAF_COLOR_BIT; + return m_device->createImageView(std::move(params)); + }; + + smart_refctd_ptr solidAngleView; + smart_refctd_ptr mainView; + const uint16_t2 solidAngleViewRes = interface.solidAngleViewTransformReturnInfo.sceneResolution; + const uint16_t2 mainViewRes = interface.mainViewTransformReturnInfo.sceneResolution; + + // detect window minimization + if (solidAngleViewRes.x < 0x4000 && solidAngleViewRes.y < 0x4000 || mainViewRes.x < 0x4000 && mainViewRes.y < 0x4000) + { + solidAngleView = createImageAndView(solidAngleViewRes, finalSceneRenderFormat); + auto solidAngleDepthView = createImageAndView(solidAngleViewRes, sceneRenderDepthFormat); + m_solidAngleViewFramebuffer = m_device->createFramebuffer({{.renderpass = m_solidAngleRenderpass, + .depthStencilAttachments = &solidAngleDepthView.get(), + .colorAttachments = &solidAngleView.get(), + .width = solidAngleViewRes.x, + .height = solidAngleViewRes.y}}); + + mainView = createImageAndView(mainViewRes, finalSceneRenderFormat); + auto mainDepthView = createImageAndView(mainViewRes, sceneRenderDepthFormat); + m_mainViewFramebuffer = m_device->createFramebuffer({{.renderpass = m_mainRenderpass, + .depthStencilAttachments = &mainDepthView.get(), + .colorAttachments = &mainView.get(), + .width = mainViewRes.x, + .height = mainViewRes.y}}); + } + else + { + m_solidAngleViewFramebuffer = nullptr; + m_mainViewFramebuffer = nullptr; + } + + // release previous slot and its image + interface.subAllocDS->multi_deallocate(0, static_cast(CInterface::Count), interface.renderColorViewDescIndices, {.semaphore = m_semaphore.get(), .value = m_realFrameIx + 1}); + // + if (solidAngleView && mainView) + { + interface.subAllocDS->multi_allocate(0, static_cast(CInterface::Count), interface.renderColorViewDescIndices); + // update descriptor set + IGPUDescriptorSet::SDescriptorInfo infos[static_cast(CInterface::Count)] = {}; + infos[0].desc = mainView; + infos[0].info.image.imageLayout = 
IGPUImage::LAYOUT::READ_ONLY_OPTIMAL; + infos[1].desc = solidAngleView; + infos[1].info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL; + const IGPUDescriptorSet::SWriteDescriptorSet write[static_cast(CInterface::Count)] = { + {.dstSet = interface.subAllocDS->getDescriptorSet(), + .binding = TexturesImGUIBindingIndex, + .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_MAIN_VIEW)], + .count = 1, + .info = &infos[static_cast(CInterface::ERV_MAIN_VIEW)]}, + {.dstSet = interface.subAllocDS->getDescriptorSet(), + .binding = TexturesImGUIBindingIndex, + .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)], + .count = 1, + .info = &infos[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)]}}; + m_device->updateDescriptorSets({write, static_cast(CInterface::Count)}, {}); + } + interface.transformParams.sceneTexDescIx = interface.renderColorViewDescIndices[CInterface::ERV_MAIN_VIEW]; + } + + inline void beginRenderpass(IGPUCommandBuffer* cb, const IGPUCommandBuffer::SRenderpassBeginInfo& info) + { + cb->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); + cb->setScissor(0, 1, &info.renderArea); + const SViewport viewport = { + .x = 0, + .y = 0, + .width = static_cast(info.renderArea.extent.width), + .height = static_cast(info.renderArea.extent.height)}; + cb->setViewport(0u, 1u, &viewport); + } + + ~SolidAngleVisualizer() override + { + m_allocation.memory->unmap(); + } + + // Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers + constexpr static inline uint32_t MaxFramesInFlight = 3u; + constexpr static inline auto sceneRenderDepthFormat = EF_D32_SFLOAT; + constexpr static inline auto finalSceneRenderFormat = EF_R8G8B8A8_SRGB; + constexpr static inline auto TexturesImGUIBindingIndex = 0u; + // we create the Descriptor Set with a few slots extra to spare, so we don't have to `waitIdle` the device whenever 
ImGUI virtual window resizes + constexpr static inline auto MaxImGUITextures = 2u + MaxFramesInFlight; + + static inline SAMPLING_MODE_FLAGS m_samplingMode = SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID; + static inline bool m_debugVisualization = true; + static inline int m_SampleCount = 64; + static inline int m_BenchmarkSampleCount = 128; + static inline bool m_frameSeeding = true; + static inline ResultData m_GPUOutResulData; + bool m_keepRunning = true; + bool m_nsightBenchDone = false; + // + smart_refctd_ptr m_scene; + smart_refctd_ptr m_solidAngleRenderpass; + smart_refctd_ptr m_mainRenderpass; + smart_refctd_ptr m_renderer; + smart_refctd_ptr m_solidAngleViewFramebuffer; + smart_refctd_ptr m_mainViewFramebuffer; + // Pipeline variants: SolidAngleVis indexed by [mode * 2 + debugFlag], RayVis by [debugFlag] + static constexpr uint32_t DebugPermutations = 2; + smart_refctd_ptr m_solidAngleVisPipelines[SAMPLING_MODE_FLAGS::Count * DebugPermutations]; + smart_refctd_ptr m_rayVisPipelines[DebugPermutations]; + // + nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; + smart_refctd_ptr m_outputStorageBuffer; + smart_refctd_ptr m_ds = nullptr; + smart_refctd_ptr m_semaphore; + uint64_t m_realFrameIx = 0; + std::array, MaxFramesInFlight> m_cmdBufs; + // + InputSystem::ChannelReader mouse; + InputSystem::ChannelReader keyboard; + // UI stuff + struct CInterface + { + void operator()() + { + ImGuiIO& io = ImGui::GetIO(); + + // TODO: why is this a lambda and not just an assignment in a scope ? + camera.setProjectionMatrix([&]() + { + hlsl::float32_t4x4 projection; + + if (isPerspective) + if (isLH) + projection = hlsl::math::thin_lens::lhPerspectiveFovMatrix(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y * 0.5f, zNear, zFar); // TODO: why do I need to divide aspect ratio by 2? 
+ else + projection = hlsl::math::thin_lens::rhPerspectiveFovMatrix(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y * 0.5f, zNear, zFar); + else + { + float viewHeight = viewWidth * io.DisplaySize.y / io.DisplaySize.x; + + if (isLH) + projection = hlsl::math::thin_lens::lhPerspectiveFovMatrix(viewWidth, viewHeight, zNear, zFar); + else + projection = hlsl::math::thin_lens::rhPerspectiveFovMatrix(viewWidth, viewHeight, zNear, zFar); + } + + return projection; + }()); + + ImGuizmo::SetOrthographic(!isPerspective); + ImGuizmo::BeginFrame(); + + ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing); + + // create a window and insert the inspector + ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); + ImGui::Begin("Editor"); + + ImGui::Text("Benchmarking Solid Angle Visualizer"); + + if (ImGui::Button("Run Benchmark")) + { + SolidAngleVisualizer::SamplingBenchmark benchmark(*m_visualizer); + benchmark.run(); + } + ImGui::Separator(); + + ImGui::Text("Sampling Mode:"); + ImGui::SameLine(); + + const char* samplingModes[SAMPLING_MODE_FLAGS::CountWithoutCreateOnly] = {}; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID)] = "Spherical Rectangle From Pyramid"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID)] = "Caliper Rectangle From Pyramid"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID)] = "Projected Spherical Rectangle From Pyramid"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE)] = "Spherical Triangle"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE)] = "Projected Spherical Triangle"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE)] = "Projected Parallelogram"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID)] = "Bilinear 
Pyramid"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT)] = "OBB Face Direct"; + + int currentMode = static_cast(denseIdOf(m_samplingMode)); + + if (ImGui::Combo("##SamplingMode", ¤tMode, samplingModes, SAMPLING_MODE_FLAGS::CountWithoutCreateOnly)) + { + m_samplingMode = kAllModes[currentMode]; + } + + ImGui::Checkbox("Debug Visualization", &m_debugVisualization); + ImGui::Text("Pipeline idx: SA=%d, Ray=%d", static_cast(denseIdOf(m_samplingMode)) * DebugPermutations + (m_debugVisualization ? 1 : 0), m_debugVisualization ? 1 : 0); + ImGui::Checkbox("Frame seeding", &m_frameSeeding); + + ImGui::SliderInt("Sample Count", &m_SampleCount, 0, 512); + ImGui::SliderInt("Benchmark Sample Count", &m_BenchmarkSampleCount, 0, 8096); + + ImGui::Separator(); + + ImGui::Text("Camera"); + + if (ImGui::RadioButton("LH", isLH)) + isLH = true; + + ImGui::SameLine(); + + if (ImGui::RadioButton("RH", !isLH)) + isLH = false; + + if (ImGui::RadioButton("Perspective", isPerspective)) + isPerspective = true; + + ImGui::SameLine(); + + if (ImGui::RadioButton("Orthographic", !isPerspective)) + isPerspective = false; + + ImGui::Checkbox("Enable \"view manipulate\"", &transformParams.enableViewManipulate); + // ImGui::Checkbox("Enable camera movement", &move); + ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f); + + // ImGui::Checkbox("Flip Gizmo's Y axis", &flipGizmoY); // let's not expose it to be changed in UI but keep the logic in case + + if (isPerspective) + ImGui::SliderFloat("Fov", &fov, 20.f, 150.f); + else + ImGui::SliderFloat("Ortho width", &viewWidth, 1, 20); + + ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f); + ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f); + + if (firstFrame) + { + camera.setPosition(cameraIntialPosition); + camera.setTarget(cameraInitialTarget); + camera.setUpVector(cameraInitialUp); + + camera.recomputeViewMatrix(); + } + firstFrame = false; + + ImGui::Text("X: %f Y: 
%f", io.MousePos.x, io.MousePos.y); + if (ImGuizmo::IsUsing()) + { + ImGui::Text("Using gizmo"); + } + else + { + ImGui::Text(ImGuizmo::IsOver() ? "Over gizmo" : ""); + ImGui::SameLine(); + ImGui::Text(ImGuizmo::IsOver(ImGuizmo::TRANSLATE) ? "Over translate gizmo" : ""); + ImGui::SameLine(); + ImGui::Text(ImGuizmo::IsOver(ImGuizmo::ROTATE) ? "Over rotate gizmo" : ""); + ImGui::SameLine(); + ImGui::Text(ImGuizmo::IsOver(ImGuizmo::SCALE) ? "Over scale gizmo" : ""); + } + ImGui::Separator(); + + /* + * ImGuizmo expects view & perspective matrix to be column major both with 4x4 layout + * and Nabla uses row major matricies - 3x4 matrix for view & 4x4 for projection + + - VIEW: + + ImGuizmo + + | X[0] Y[0] Z[0] 0.0f | + | X[1] Y[1] Z[1] 0.0f | + | X[2] Y[2] Z[2] 0.0f | + | -Dot(X, eye) -Dot(Y, eye) -Dot(Z, eye) 1.0f | + + Nabla + + | X[0] X[1] X[2] -Dot(X, eye) | + | Y[0] Y[1] Y[2] -Dot(Y, eye) | + | Z[0] Z[1] Z[2] -Dot(Z, eye) | + + = transpose(nbl::core::matrix4SIMD()) + + - PERSPECTIVE [PROJECTION CASE]: + + ImGuizmo + + | (temp / temp2) (0.0) (0.0) (0.0) | + | (0.0) (temp / temp3) (0.0) (0.0) | + | ((right + left) / temp2) ((top + bottom) / temp3) ((-zfar - znear) / temp4) (-1.0f) | + | (0.0) (0.0) ((-temp * zfar) / temp4) (0.0) | + + Nabla + + | w (0.0) (0.0) (0.0) | + | (0.0) -h (0.0) (0.0) | + | (0.0) (0.0) (-zFar/(zFar-zNear)) (-zNear*zFar/(zFar-zNear)) | + | (0.0) (0.0) (-1.0) (0.0) | + + = transpose() + + * + * the ViewManipulate final call (inside EditTransform) returns world space column major matrix for an object, + * note it also modifies input view matrix but projection matrix is immutable + */ + + if (ImGui::IsKeyPressed(ImGuiKey_End)) + { + m_TRS = TRS {}; + } + + { + static struct + { + float32_t4x4 view, projection, model; + } imguizmoM16InOut; + + ImGuizmo::SetID(0u); + + // TODO: camera will return hlsl::float32_tMxN + auto view = camera.getViewMatrix(); + imguizmoM16InOut.view = hlsl::transpose(hlsl::math::linalg::promote_affine<4, 4>(view)); + + 
// TODO: camera will return hlsl::float32_tMxN + imguizmoM16InOut.projection = hlsl::transpose(camera.getProjectionMatrix()); + + if (flipGizmoY) // note we allow to flip gizmo just to match our coordinates + imguizmoM16InOut.projection[1][1] *= -1.f; // https://johannesugb.github.io/gpu-programming/why-do-opengl-proj-matrices-fail-in-vulkan/ + + transformParams.editTransformDecomposition = true; + + // Target selector: OBB (full TRS) or ShadingPoint (translation-only). + // The same EditTransform/Manipulate widget drives whichever is selected; + // we just swap which matrix it operates on and decompose accordingly. + { + int target = static_cast(m_GizmoTarget); + ImGui::Text("Gizmo target:"); + ImGui::SameLine(); + if (ImGui::RadioButton("OBB", &target, static_cast(GizmoTarget::OBB))) + m_GizmoTarget = GizmoTarget::OBB; + ImGui::SameLine(); + if (ImGui::RadioButton("Shading Point", &target, static_cast(GizmoTarget::ShadingPoint))) + m_GizmoTarget = GizmoTarget::ShadingPoint; + } + + if (m_GizmoTarget == GizmoTarget::OBB) + { + ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &imguizmoM16InOut.model[0][0]); + + mainViewTransformReturnInfo = EditTransform(&imguizmoM16InOut.view[0][0], &imguizmoM16InOut.projection[0][0], &imguizmoM16InOut.model[0][0], transformParams); + move = mainViewTransformReturnInfo.allowCameraMovement; + + ImGuizmo::DecomposeMatrixToComponents(&imguizmoM16InOut.model[0][0], &m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x); + ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &imguizmoM16InOut.model[0][0]); + } + else + { + // ShadingPoint mode: build identity-rotation/unit-scale matrix + // with translation = m_ShadingPoint; only the translation column + // round-trips through the gizmo. 
+ float32_t3 spRotation {0.0f}; + float32_t3 spScale {1.0f}; + ImGuizmo::RecomposeMatrixFromComponents(&m_ShadingPoint.x, &spRotation.x, &spScale.x, &imguizmoM16InOut.model[0][0]); + + mainViewTransformReturnInfo = EditTransform(&imguizmoM16InOut.view[0][0], &imguizmoM16InOut.projection[0][0], &imguizmoM16InOut.model[0][0], transformParams); + move = mainViewTransformReturnInfo.allowCameraMovement; + + ImGuizmo::DecomposeMatrixToComponents(&imguizmoM16InOut.model[0][0], &m_ShadingPoint.x, &spRotation.x, &spScale.x); + } + } + // object meta display + //{ + // ImGui::Begin("Object"); + // ImGui::Text("type: \"%s\"", objectName.data()); + // ImGui::End(); + //} + + // solid angle view window + { + ImGui::SetNextWindowSize(ImVec2(800, 800), ImGuiCond_Appearing); + ImGui::SetNextWindowPos(ImVec2(1240, 20), ImGuiCond_Appearing); + static bool isOpen = true; + ImGui::Begin("Projected Solid Angle View", &isOpen, 0); + + ImVec2 contentRegionSize = ImGui::GetContentRegionAvail(); + solidAngleViewTransformReturnInfo.sceneResolution = uint16_t2(static_cast(contentRegionSize.x), static_cast(contentRegionSize.y)); + solidAngleViewTransformReturnInfo.allowCameraMovement = false; // not used in this view + ImGui::Image({renderColorViewDescIndices[ERV_SOLID_ANGLE_VIEW]}, contentRegionSize); + ImGui::End(); + } + + // Show data coming from GPU + if (m_debugVisualization) + { + if (ImGui::Begin("Result Data")) + { + auto drawColorField = [&](const char* fieldName, uint32_t index) + { + ImGui::Text("%s: %u", fieldName, index); + + if (index >= 27) + { + ImGui::SameLine(); + ImGui::Text(""); + return; + } + + const auto& c = colorLUT[index]; // uses the combined LUT we made earlier + + ImGui::SameLine(); + + // Color preview button + ImGui::ColorButton( + fieldName, + ImVec4(c.r, c.g, c.b, 1.0f), + 0, + ImVec2(20, 20)); + + ImGui::SameLine(); + ImGui::Text("%s", colorNames[index]); + }; + + // Vertices + if (ImGui::CollapsingHeader("Vertices", ImGuiTreeNodeFlags_DefaultOpen)) + { + 
for (uint32_t i = 0; i < 6; ++i) + { + if (i < m_GPUOutResulData.silhouette.silhouetteVertexCount) + { + ImGui::Text("corners[%u]", i); + ImGui::SameLine(); + drawColorField(":", m_GPUOutResulData.silhouette.vertices[i]); + ImGui::SameLine(); + static const float32_t3 constCorners[8] = { + float32_t3(0, 0, 0), float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(1, 1, 0), + float32_t3(0, 0, 1), float32_t3(1, 0, 1), float32_t3(0, 1, 1), float32_t3(1, 1, 1)}; + float32_t3 vertexLocation = constCorners[m_GPUOutResulData.silhouette.vertices[i]]; + ImGui::Text(" : (%.3f, %.3f, %.3f", vertexLocation.x, vertexLocation.y, vertexLocation.z); + } + else + { + ImGui::Text("corners[%u] :: ", i); + ImGui::SameLine(); + ImGui::ColorButton( + "", + ImVec4(0.0f, 0.0f, 0.0f, 0.0f), + 0, + ImVec2(20, 20)); + ImGui::SameLine(); + ImGui::Text(""); + } + } + } + + if (ImGui::CollapsingHeader("Color LUT Map")) + { + for (int i = 0; i < 27; i++) + drawColorField(" ", i); + } + + ImGui::Separator(); + ImGui::Text("Valid Samples: %u / %u", m_GPUOutResulData.sampling.validSampleCount / hlsl::max(m_GPUOutResulData.sampling.threadCount, 1u), m_GPUOutResulData.sampling.sampleCount); + ImGui::ProgressBar(static_cast(m_GPUOutResulData.sampling.validSampleCount / hlsl::max(m_GPUOutResulData.sampling.threadCount, 1u)) / static_cast(m_GPUOutResulData.sampling.sampleCount)); + ImGui::Separator(); + + // Silhouette + if (ImGui::CollapsingHeader("Silhouette")) + { + drawColorField("silhouetteIndex", m_GPUOutResulData.silhouette.silhouetteIndex); + ImGui::Text("Region: (%u, %u, %u)", m_GPUOutResulData.silhouette.region.x, m_GPUOutResulData.silhouette.region.y, m_GPUOutResulData.silhouette.region.z); + ImGui::Text("Silhouette Vertex Count: %u", m_GPUOutResulData.silhouette.silhouetteVertexCount); + ImGui::Text("Positive Vertex Count: %u", m_GPUOutResulData.silhouette.positiveVertCount); + ImGui::Text("Edge Visibility Mismatch: %s", m_GPUOutResulData.silhouette.edgeVisibilityMismatch ? 
"true" : "false"); + ImGui::Text("Max Triangles Exceeded: %s", m_GPUOutResulData.triangleFan.maxTrianglesExceeded ? "true" : "false"); + for (uint32_t i = 0; i < 6; i++) + ImGui::Text("Vertex[%u]: %u", i, m_GPUOutResulData.silhouette.vertices[i]); + ImGui::Text("Clipped Silhouette Vertex Count: %u", m_GPUOutResulData.silhouette.clippedVertexCount); + for (uint32_t i = 0; i < 7; i++) + ImGui::Text("Clipped Vertex[%u]: (%.3f, %.3f, %.3f) Index: %u", i, + m_GPUOutResulData.silhouette.clippedVertices[i].x, + m_GPUOutResulData.silhouette.clippedVertices[i].y, + m_GPUOutResulData.silhouette.clippedVertices[i].z, + m_GPUOutResulData.silhouette.clippedVertexIndices[i]); + + // Silhouette mask printed in binary + auto printBin = [](uint32_t bin, const char* name) + { + char buf[33]; + for (int i = 0; i < 32; i++) + buf[i] = (bin & (1u << (31 - i))) ? '1' : '0'; + buf[32] = '\0'; + ImGui::Text("%s: 0x%08X", name, bin); + ImGui::Text("binary: 0b%s", buf); + ImGui::Separator(); + }; + printBin(m_GPUOutResulData.silhouette.silhouette, "Silhouette"); + printBin(m_GPUOutResulData.silhouette.rotatedSil, "rotatedSilhouette"); + + printBin(m_GPUOutResulData.silhouette.clipCount, "clipCount"); + printBin(m_GPUOutResulData.silhouette.clipMask, "clipMask"); + printBin(m_GPUOutResulData.silhouette.rotatedClipMask, "rotatedClipMask"); + printBin(m_GPUOutResulData.silhouette.rotateAmount, "rotateAmount"); + printBin(m_GPUOutResulData.silhouette.wrapAround, "wrapAround"); + } + + // Parallelogram + if (m_samplingMode & FLAG_PARALLELOGRAM && ImGui::CollapsingHeader("Projected Parallelogram", ImGuiTreeNodeFlags_DefaultOpen)) + { + ImGui::Text("Area: %.3f", m_GPUOutResulData.parallelogram.area); + ImGui::Text("N3 Mask: 0x%02X", m_GPUOutResulData.parallelogram.n3Mask); + for (uint32_t i = 0; i < 4; i++) + { + bool convex = m_GPUOutResulData.parallelogram.edgeIsConvex[i] != 0; + bool n3 = (m_GPUOutResulData.parallelogram.n3Mask >> i) & 1u; + ImGui::Text("Edge[%u]: %s%s", i, + convex ? 
"convex" : "concave", + n3 ? " (N3 split)" : ""); + } + for (uint32_t i = 0; i < 4; i++) + ImGui::Text("Corner[%u]: (%.3f, %.3f)", i, m_GPUOutResulData.parallelogram.corners[i].x, m_GPUOutResulData.parallelogram.corners[i].y); + } + else if ((m_samplingMode & FLAG_PYRAMID) && ImGui::CollapsingHeader("Spherical Pyramid", ImGuiTreeNodeFlags_DefaultOpen)) + { + ImGui::Text("Best Caliper Edge: %u", m_GPUOutResulData.pyramid.bestEdge); + ImGui::Separator(); + + ImGui::Text("Axis 1: (%.4f, %.4f, %.4f)", + m_GPUOutResulData.pyramid.axis1.x, m_GPUOutResulData.pyramid.axis1.y, m_GPUOutResulData.pyramid.axis1.z); + ImGui::Text(" Half-Width: %.4f Offset: %.4f", + m_GPUOutResulData.pyramid.halfWidth1, m_GPUOutResulData.pyramid.offset1); + ImGui::Text(" Bounds: [%.4f, %.4f]", + m_GPUOutResulData.pyramid.min1, m_GPUOutResulData.pyramid.max1); + + ImGui::Text("Axis 2: (%.4f, %.4f, %.4f)", + m_GPUOutResulData.pyramid.axis2.x, m_GPUOutResulData.pyramid.axis2.y, m_GPUOutResulData.pyramid.axis2.z); + ImGui::Text(" Half-Width: %.4f Offset: %.4f", + m_GPUOutResulData.pyramid.halfWidth2, m_GPUOutResulData.pyramid.offset2); + ImGui::Text(" Bounds: [%.4f, %.4f]", + m_GPUOutResulData.pyramid.min2, m_GPUOutResulData.pyramid.max2); + + ImGui::Separator(); + ImGui::Text("Center: (%.4f, %.4f, %.4f)", + m_GPUOutResulData.pyramid.center.x, m_GPUOutResulData.pyramid.center.y, m_GPUOutResulData.pyramid.center.z); + ImGui::Text("Solid Angle (bound): %.6f sr", m_GPUOutResulData.pyramid.solidAngle); + } + else if (m_samplingMode & FLAG_TRIANGLE && ImGui::CollapsingHeader("Spherical Triangle", ImGuiTreeNodeFlags_DefaultOpen)) + { + ImGui::Text("Spherical Lune Detected: %s", m_GPUOutResulData.triangleFan.sphericalLuneDetected ? 
"true" : "false"); + ImGui::Text("Triangle Count: %u", m_GPUOutResulData.triangleFan.triangleCount); + // print solidAngles for each triangle + { + ImGui::Text("Solid Angles per Triangle:"); + ImGui::BeginTable("SolidAnglesTable", 2); + ImGui::TableSetupColumn("Triangle Index"); + ImGui::TableSetupColumn("Solid Angle"); + ImGui::TableHeadersRow(); + for (uint32_t i = 0; i < m_GPUOutResulData.triangleFan.triangleCount; ++i) + { + ImGui::TableNextRow(); + ImGui::TableSetColumnIndex(0); + ImGui::Text("%u", i); + ImGui::TableSetColumnIndex(1); + ImGui::Text("%.6f", m_GPUOutResulData.triangleFan.solidAngles[i]); + } + ImGui::Text("Total: %.6f", m_GPUOutResulData.triangleFan.totalSolidAngles); + ImGui::EndTable(); + } + } + + { + float32_t3 xAxis = m_OBBModelMatrix[0].xyz; + float32_t3 yAxis = m_OBBModelMatrix[1].xyz; + float32_t3 zAxis = m_OBBModelMatrix[2].xyz; + + float32_t3 nx = normalize(xAxis); + float32_t3 ny = normalize(yAxis); + float32_t3 nz = normalize(zAxis); + + const float epsilon = 1e-4; + bool hasSkew = false; + if (abs(dot(nx, ny)) > epsilon || abs(dot(nx, nz)) > epsilon || abs(dot(ny, nz)) > epsilon) + hasSkew = true; + ImGui::Separator(); + ImGui::Text("Matrix Has Skew: %s", hasSkew ? 
"true" : "false"); + } + + static bool modalShown = false; + static bool modalDismissed = false; + static uint32_t lastSilhouetteIndex = ~0u; + + // Reset modal flags if silhouette configuration changed + if (m_GPUOutResulData.silhouette.silhouetteIndex != lastSilhouetteIndex) + { + modalShown = false; + modalDismissed = false; // Allow modal to show again for new configuration + lastSilhouetteIndex = m_GPUOutResulData.silhouette.silhouetteIndex; + } + + // Reset flags when mismatch is cleared + if (!m_GPUOutResulData.silhouette.edgeVisibilityMismatch && !m_GPUOutResulData.triangleFan.maxTrianglesExceeded && !m_GPUOutResulData.triangleFan.sphericalLuneDetected) + { + modalShown = false; + modalDismissed = false; + } + + // Open modal only if not already shown/dismissed + if ((m_GPUOutResulData.silhouette.edgeVisibilityMismatch || m_GPUOutResulData.triangleFan.maxTrianglesExceeded || m_GPUOutResulData.triangleFan.sphericalLuneDetected) && m_GPUOutResulData.silhouette.silhouetteIndex != 13 && !modalShown && !modalDismissed) // Don't reopen if user dismissed it + { + ImGui::OpenPopup("Edge Visibility Mismatch Warning"); + modalShown = true; + } + + // Modal popup + if (ImGui::BeginPopupModal("Edge Visibility Mismatch Warning", NULL, ImGuiWindowFlags_AlwaysAutoResize)) + { + ImGui::TextColored(ImVec4(1.0f, 0.5f, 0.0f, 1.0f), "Warning: Edge Visibility Mismatch Detected!"); + ImGui::Separator(); + ImGui::Text("The silhouette lookup table (LUT) does not match the computed edge visibility."); + ImGui::Text("This indicates the pre-computed silhouette data may be incorrect."); + ImGui::Spacing(); + ImGui::TextWrapped("Configuration Index: %u", m_GPUOutResulData.silhouette.silhouetteIndex); + ImGui::TextWrapped("Region: (%u, %u, %u)", m_GPUOutResulData.silhouette.region.x, m_GPUOutResulData.silhouette.region.y, m_GPUOutResulData.silhouette.region.z); + ImGui::Spacing(); + ImGui::Text("Mismatched Vertices (bitmask): 0x%08X", 
m_GPUOutResulData.silhouette.edgeVisibilityMismatch); + ImGui::Text("Vertices involved in mismatched edges:"); + ImGui::Indent(); + for (int i = 0; i < 8; i++) + { + if (m_GPUOutResulData.silhouette.edgeVisibilityMismatch & (1u << i)) + { + ImGui::BulletText("Vertex %d", i); + } + } + ImGui::Unindent(); + ImGui::Spacing(); + if (ImGui::Button("OK", ImVec2(120, 0))) + { + ImGui::CloseCurrentPopup(); + modalShown = false; + modalDismissed = true; // Mark as dismissed to prevent reopening + } + ImGui::EndPopup(); + } + } + ImGui::End(); + } + + // view matrices editor + { + ImGui::Begin("Matrices"); + + auto addMatrixTable = [&](const char* topText, const char* tableName, const int rows, const int columns, const float* pointer, const bool withSeparator = true) + { + ImGui::Text(topText); + if (ImGui::BeginTable(tableName, columns)) + { + for (int y = 0; y < rows; ++y) + { + ImGui::TableNextRow(); + for (int x = 0; x < columns; ++x) + { + ImGui::TableSetColumnIndex(x); + ImGui::Text("%.3f", *(pointer + (y * columns) + x)); + } + } + ImGui::EndTable(); + } + + if (withSeparator) + ImGui::Separator(); + }; + + static RandomSampler rng(0x45); // Initialize RNG with seed + + // Helper function to check if cube intersects unit sphere at origin + auto isCubeOutsideUnitSphere = [](const float32_t3& translation, const float32_t3& scale) -> bool + { + float cubeRadius = glm::length(scale) * 0.5f; + float distanceToCenter = glm::length(translation); + return (distanceToCenter - cubeRadius) > 1.0f; + }; + + static TRS lastTRS = {}; + if (ImGui::Button("Randomize Translation")) + { + lastTRS = m_TRS; // Backup before randomizing + int attempts = 0; + do + { + m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f)); + attempts++; + } while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100); + } + ImGui::SameLine(); + if (ImGui::Button("Randomize Rotation")) + { + lastTRS = m_TRS; // Backup before 
randomizing + m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f)); + } + ImGui::SameLine(); + if (ImGui::Button("Randomize Scale")) + { + lastTRS = m_TRS; // Backup before randomizing + int attempts = 0; + do + { + m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f)); + attempts++; + } while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100); + } + // ImGui::SameLine(); + if (ImGui::Button("Randomize All")) + { + lastTRS = m_TRS; // Backup before randomizing + int attempts = 0; + do + { + m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f)); + m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f)); + m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f)); + attempts++; + } while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100); + } + ImGui::SameLine(); + if (ImGui::Button("Revert to Last")) + { + m_TRS = lastTRS; // Restore backed-up TRS + } + + addMatrixTable("Model Matrix", "ModelMatrixTable", 4, 4, &m_OBBModelMatrix[0][0]); + addMatrixTable("Camera View Matrix", "ViewMatrixTable", 3, 4, &camera.getViewMatrix()[0].x); + addMatrixTable("Camera View Projection Matrix", "ViewProjectionMatrixTable", 4, 4, &camera.getProjectionMatrix()[0].x, false); + + ImGui::End(); + } + + // Nabla Imgui backend MDI buffer info + // To be 100% accurate and not overly conservative we'd have to explicitly `cull_frees` and defragment each time, + // so unless you do that, don't use this basic info to optimize the size of your IMGUI buffer. 
+ { + auto* streaminingBuffer = imGUI->getStreamingBuffer(); + + const size_t total = streaminingBuffer->get_total_size(); // total memory range size for which allocation can be requested + const size_t freeSize = streaminingBuffer->getAddressAllocator().get_free_size(); // max total free bloock memory size we can still allocate from total memory available + const size_t consumedMemory = total - freeSize; // memory currently consumed by streaming buffer + + float freePercentage = 100.0f * (float)(freeSize) / (float)total; + float allocatedPercentage = (float)(consumedMemory) / (float)total; + + ImVec2 barSize = ImVec2(400, 30); + float windowPadding = 10.0f; + float verticalPadding = ImGui::GetStyle().FramePadding.y; + + ImGui::SetNextWindowSize(ImVec2(barSize.x + 2 * windowPadding, 110 + verticalPadding), ImGuiCond_Always); + ImGui::Begin("Nabla Imgui MDI Buffer Info", nullptr, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoScrollbar); + + ImGui::Text("Total Allocated Size: %zu bytes", total); + ImGui::Text("In use: %zu bytes", consumedMemory); + ImGui::Text("Buffer Usage:"); + + ImGui::SetCursorPosX(windowPadding); + + if (freePercentage > 70.0f) + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(0.0f, 1.0f, 0.0f, 0.4f)); // Green + else if (freePercentage > 30.0f) + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 1.0f, 0.0f, 0.4f)); // Yellow + else + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 0.0f, 0.0f, 0.4f)); // Red + + ImGui::ProgressBar(allocatedPercentage, barSize, ""); + + ImGui::PopStyleColor(); + + ImDrawList* drawList = ImGui::GetWindowDrawList(); + + ImVec2 progressBarPos = ImGui::GetItemRectMin(); + ImVec2 progressBarSize = ImGui::GetItemRectSize(); + + const char* text = "%.2f%% free"; + char textBuffer[64]; + snprintf(textBuffer, sizeof(textBuffer), text, freePercentage); + + ImVec2 textSize = ImGui::CalcTextSize(textBuffer); + ImVec2 textPos = ImVec2( + progressBarPos.x + (progressBarSize.x - textSize.x) * 0.5f, + 
progressBarPos.y + (progressBarSize.y - textSize.y) * 0.5f); + + ImVec4 bgColor = ImGui::GetStyleColorVec4(ImGuiCol_WindowBg); + drawList->AddRectFilled( + ImVec2(textPos.x - 5, textPos.y - 2), + ImVec2(textPos.x + textSize.x + 5, textPos.y + textSize.y + 2), + ImGui::GetColorU32(bgColor)); + + ImGui::SetCursorScreenPos(textPos); + ImGui::Text("%s", textBuffer); + + ImGui::Dummy(ImVec2(0.0f, verticalPadding)); + + ImGui::End(); + } + ImGui::End(); + + ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &m_OBBModelMatrix[0][0]); + } + + smart_refctd_ptr imGUI; + + // descriptor set + smart_refctd_ptr subAllocDS; + enum E_RENDER_VIEWS : uint8_t + { + ERV_MAIN_VIEW, + ERV_SOLID_ANGLE_VIEW, + Count + }; + SubAllocatedDescriptorSet::value_type renderColorViewDescIndices[E_RENDER_VIEWS::Count] = {SubAllocatedDescriptorSet::invalid_value, SubAllocatedDescriptorSet::invalid_value}; + // + Camera camera = Camera(cameraIntialPosition, cameraInitialTarget, {}, 1, 1, nbl::core::vectorSIMDf(0.0f, 0.0f, 1.0f)); + // mutables + struct TRS // Source of truth + { + float32_t3 translation {0.0f, 0.0f, 1.5f}; + float32_t3 rotation {0.0f}; // MUST stay orthonormal + float32_t3 scale {1.0f}; + } m_TRS; + float32_t4x4 m_OBBModelMatrix; // always overwritten from TRS + float32_t3 m_ShadingPoint {0.0f, 0.0f, 0.0f}; // world-space observer; samplers operate in shading-point-relative coords + enum class GizmoTarget : uint8_t + { + OBB, + ShadingPoint + }; + GizmoTarget m_GizmoTarget = GizmoTarget::OBB; // which entity the manipulator gizmo currently drives + + // std::string_view objectName; + TransformRequestParams transformParams; + TransformReturnInfo mainViewTransformReturnInfo; + TransformReturnInfo solidAngleViewTransformReturnInfo; + + const static inline core::vectorSIMDf cameraIntialPosition {-3.0f, 6.0f, 3.0f}; + const static inline core::vectorSIMDf cameraInitialTarget {0.f, 0.0f, 3.f}; + const static inline core::vectorSIMDf 
cameraInitialUp {0.f, 0.f, 1.f}; + + float fov = 90.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f; + float viewWidth = 10.f; + // uint16_t gcIndex = {}; // note: this is dirty however since I assume only single object in scene I can leave it now, when this example is upgraded to support multiple objects this needs to be changed + bool isPerspective = true, isLH = true, flipGizmoY = true, move = true; + bool firstFrame = true; + + SolidAngleVisualizer* m_visualizer; + } interface; + + class SamplingBenchmark final + { + public: + SamplingBenchmark(SolidAngleVisualizer& base) + : m_api(base.m_api), m_device(base.m_device), m_logger(base.m_logger), m_visualizer(&base) + { + // setting up pipeline in the constructor + m_queueFamily = base.getComputeQueue()->getFamilyIndex(); + m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) + base.logFail("Failed to create Command Buffers!\n"); + + // Load shaders, set up pipelines (one per sampling mode) + { + auto loadShader = [&](auto key) -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = base.m_logger.get(); + lp.workingDirectory = "app_resources"; + auto assetBundle = base.m_assetMgr->getAsset(key.data(), lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + base.logFail("Could not load shader!"); + assert(0); + } + assert(assets.size() == 1); + auto shader = IAsset::castDown(assets[0]); + if (!shader) + base.logFail("Failed to load precompiled benchmark shader!\n"); + return shader; + }; + + const char* shaderNames[SAMPLING_MODE_FLAGS::Count] = {}; + smart_refctd_ptr shaders[SAMPLING_MODE_FLAGS::Count]; + + auto addBench = [&](SAMPLING_MODE_FLAGS mode) + { + shaderNames[denseIdOf(mode)] = Key.value; + shaders[denseIdOf(mode)] = 
loadShader(nbl::this_example::builtin::build::get_spirv_key(m_device.get())); + }; + + addBench.template operator()<"benchmark_tri_sa">(SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE); + addBench.template operator()<"benchmark_tri_psa">(SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE); + addBench.template operator()<"benchmark_para">(SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE); + addBench.template operator()<"benchmark_rectangle">(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID); + addBench.template operator()<"benchmark_bilinear">(SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID); + addBench.template operator()<"benchmark_proj_rectangle">(SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID); + addBench.template operator()<"benchmark_silhouette">(SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY); + addBench.template operator()<"benchmark_pyramid_creation">(SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY); + addBench.template operator()<"benchmark_caliper_pyramid_creation">(SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY); + addBench.template operator()<"benchmark_caliper_rectangle">(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID); + addBench.template operator()<"benchmark_obb_face_direct">(SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT); + + nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { + {.binding = 0, + .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = ShaderStage::ESS_COMPUTE, + .count = 1}}; + smart_refctd_ptr dsLayout = base.m_device->createDescriptorSetLayout(bindings); + if (!dsLayout) + base.logFail("Failed to create a Descriptor Layout!\n"); + + SPushConstantRange pushConstantRanges[] = { + {.stageFlags = ShaderStage::ESS_COMPUTE, + .offset = 0, + .size = sizeof(BenchmarkPushConstants)}}; + m_pplnLayout = base.m_device->createPipelineLayout(pushConstantRanges, smart_refctd_ptr(dsLayout)); + if (!m_pplnLayout) + base.logFail("Failed to create a 
Pipeline Layout!\n"); + + for (uint32_t i = 0; i < SAMPLING_MODE_FLAGS::Count; i++) + { + IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pplnLayout.get(); + params.shader.entryPoint = "main"; + params.shader.shader = shaders[i].get(); + if (base.m_device->getEnabledFeatures().pipelineExecutableInfo) + { + params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS; + params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; + } + if (!base.m_device->createComputePipelines(nullptr, {¶ms, 1}, &m_pipelines[i])) + base.logFail("Failed to create pipelines (compile & link shaders)!\n"); + if (base.m_device->getEnabledFeatures().pipelineExecutableInfo) + { + m_pipelineReports[i] = system::to_string(m_pipelines[i]->getExecutableInfo()); + m_pipelineReportNames[i] = shaderNames[i]; + } + } + + // Allocate the memory + { + constexpr size_t BufferSize = BENCHMARK_WORKGROUP_COUNT * BENCHMARK_WORKGROUP_DIMENSION_SIZE_X * BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y * BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z * sizeof(uint32_t); + + nbl::video::IGPUBuffer::SCreationParams params = {}; + params.size = BufferSize; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + smart_refctd_ptr dummyBuff = base.m_device->createBuffer(std::move(params)); + if (!dummyBuff) + base.logFail("Failed to create a GPU Buffer of size %d!\n", params.size); + + dummyBuff->setObjectDebugName("benchmark buffer"); + + nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = dummyBuff->getMemoryReqs(); + + m_allocation = base.m_device->allocate(reqs, dummyBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); + if (!m_allocation.isValid()) + base.logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + + assert(dummyBuff->getBoundMemory().memory == m_allocation.memory.get()); + smart_refctd_ptr pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, {&dsLayout.get(), 1}); + + 
m_ds = pool->createDescriptorSet(std::move(dsLayout)); + { + IGPUDescriptorSet::SDescriptorInfo info[1]; + info[0].desc = smart_refctd_ptr(dummyBuff); + info[0].info.buffer = {.offset = 0, .size = BufferSize}; + IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { + {.dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = info}}; + base.m_device->updateDescriptorSets(writes, {}); + } + } + } + + IQueryPool::SCreationParams queryPoolCreationParams {}; + queryPoolCreationParams.queryType = IQueryPool::TYPE::TIMESTAMP; + queryPoolCreationParams.queryCount = 2; + queryPoolCreationParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + m_queryPool = m_device->createQueryPool(queryPoolCreationParams); + + m_computeQueue = m_device->getQueue(m_queueFamily, 0); + m_physicalDevice = base.m_device->getPhysicalDevice(); + m_timestampPeriodNs = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); + } + + void run() + { + // Pipeline executable reports first so the timings cluster at the bottom of the log. 
+ for (uint32_t i = 0; i < SAMPLING_MODE_FLAGS::Count; i++) + { + if (!m_pipelineReports[i].empty()) + m_logger->log("%s Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, m_pipelineReportNames[i], m_pipelineReports[i].c_str()); + } + + const uint64_t totalThreads = (uint64_t)BENCHMARK_WORKGROUP_COUNT * BENCHMARK_WORKGROUP_DIMENSION_SIZE_X; + m_logger->log("\n\n=== GPU Sampler Benchmarks (%d dispatches, %llu threads/dispatch, %d samples/thread, ps/sample is per all GPU threads) ===", + ILogger::ELL_PERFORMANCE, Dispatches, totalThreads, m_BenchmarkSampleCount); + m_logger->log(" timestampPeriod = %.1f ps/tick", ILogger::ELL_PERFORMANCE, m_timestampPeriodNs * 1000.0); + m_logger->log("%-29s | %-12s | %9s | %10s | %10s", + ILogger::ELL_PERFORMANCE, "Sampler", "Mode", "ps/sample", "GSamples/s", "ms total"); + + struct SamplerEntry + { + const char* name; + SAMPLING_MODE_FLAGS mode; + }; + const SamplerEntry samplers[] = { + {.name = "PYRAMID_RECTANGLE", .mode = SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID}, + {.name = "CALIPER_PYRAMID_RECTANGLE", .mode = SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID}, + {.name = "PYRAMID_PROJ_RECTANGLE", .mode = SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID}, + {.name = "PYRAMID_BILINEAR", .mode = SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID}, + {.name = "PARALLELOGRAM", .mode = SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE}, + {.name = "TRIANGLE_SA", .mode = SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE}, + {.name = "TRIANGLE_PSA", .mode = SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE}, + {.name = "OBB_FACE_DIRECT", .mode = SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT}, + }; + + // Creation-only modes: report per-creation, not per-sample. 
+ performBenchmark("SILHOUETTE_CREATION_ONLY", SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY, totalThreads, 0); + performBenchmark("PYRAMID_CREATION_ONLY", SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY, totalThreads, 0); + performBenchmark("CALIPER_PYRAMID_CREATION_ONLY", SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY, totalThreads, 0); + + // Modes per sampler: 1 creation per N samples. 1 = no amortization, sampleCount = full amortization. + const uint32_t modeRatios[] = {1u, 16u, static_cast(m_BenchmarkSampleCount)}; + for (uint32_t spc : modeRatios) + for (const auto& s : samplers) + performBenchmark(s.name, s.mode, totalThreads, spc); + } + + // Many dispatches per SAMPLING_MODE_FLAGS, all in a single capture. Intended for NSight submit-mode + // captures with the Shader Profiler -- each mode's range needs sustained execution so PC sampling + // can gather enough source-line hits. + void runNSightOneShot() + { + const char* modeNames[SAMPLING_MODE_FLAGS::Count] = {}; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID)] = "CALIPER_PYRAMID_RECTANGLE"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID)] = "PYRAMID_RECTANGLE"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID)] = "PYRAMID_PROJ_RECTANGLE"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE)] = "TRIANGLE_SA"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE)] = "TRIANGLE_PSA"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE)] = "PARALLELOGRAM"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID)] = "PYRAMID_BILINEAR"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY)] = "SILHOUETTE_CREATION_ONLY"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY)] = "PYRAMID_CREATION_ONLY"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY)] = "CALIPER_PYRAMID_CREATION_ONLY"; + 
modeNames[denseIdOf(SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT)] = "OBB_FACE_DIRECT"; + + m_pushConstants.modelMatrix = float32_t3x4(transpose(m_visualizer->interface.m_OBBModelMatrix)); + m_pushConstants.shadingPoint = m_visualizer->interface.m_ShadingPoint; + m_pushConstants.sampleCount = static_cast(m_BenchmarkSampleCount); + m_pushConstants.samplesPerCreation = m_pushConstants.sampleCount; // full amortization: 1 creation per dispatch + + m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); + m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(BenchmarkPushConstants), &m_pushConstants); + + const asset::SMemoryBarrier serializeDispatch = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + }; + const IGPUCommandBuffer::SPipelineBarrierDependencyInfo barrierInfo = {.memBarriers = {&serializeDispatch, 1}}; + + for (uint32_t mode = 0; mode < SAMPLING_MODE_FLAGS::Count; ++mode) + { + m_cmdbuf->beginDebugMarker(modeNames[mode], vectorSIMDf(0, 1, 0, 1)); + m_cmdbuf->bindComputePipeline(m_pipelines[mode].get()); + for (int i = 0; i < NSightDispatchesPerMode; ++i) + { + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + if (i + 1 < NSightDispatchesPerMode) + m_cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, barrierInfo); + } + m_cmdbuf->endDebugMarker(); + if (mode + 1u < SAMPLING_MODE_FLAGS::Count) + m_cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, barrierInfo); + } + m_cmdbuf->end(); + + smart_refctd_ptr done = m_device->createSemaphore(0); + const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {{.semaphore = done.get(), .value = 1, .stageMask = 
asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS}}; + IQueue::SSubmitInfo submitInfos[1] = {}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = {{.cmdbuf = m_cmdbuf.get()}}; + submitInfos[0].commandBuffers = cmdbufs; + submitInfos[0].signalSemaphores = signals; + + m_api->startCapture(); + m_computeQueue->submit(submitInfos); + const ISemaphore::SWaitInfo waitInfo[] = {{.semaphore = done.get(), .value = 1}}; + m_device->blockForSemaphores(waitInfo); + m_api->endCapture(); + + m_logger->log("NSight benchmarks: dispatched %u sampling modes in one submit.", ILogger::ELL_INFO, static_cast(SAMPLING_MODE_FLAGS::Count)); + } + + private: + // samplesPerCreation: > 0 selects sampling mode with that 1:N ratio; 0 means create-only mode (label "create-only"). + void performBenchmark(const char* name, SAMPLING_MODE_FLAGS mode, uint64_t totalThreads, uint32_t samplesPerCreation) + { + m_device->waitIdle(); + + m_pushConstants.modelMatrix = float32_t3x4(transpose(m_visualizer->interface.m_OBBModelMatrix)); + m_pushConstants.shadingPoint = m_visualizer->interface.m_ShadingPoint; + m_pushConstants.sampleCount = m_BenchmarkSampleCount; + // For create-only modes the inner loop is unused; pick any divisor of sampleCount to keep the shader's `creations = sampleCount / samplesPerCreation` well-defined. + m_pushConstants.samplesPerCreation = mode & FLAG_CREATE_ONLY ? uint32_t(m_BenchmarkSampleCount) : samplesPerCreation; + recordCmdBuff(mode); + + // Nabla's IQueue::submit rejects submissions without a signal semaphore + // (SSubmitInfo::valid() requires signalSemaphores non-empty so the + // submission's resources can be tracked on a timeline). 
+ smart_refctd_ptr done = m_device->createSemaphore(0); + const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {{.semaphore = done.get(), .value = 1, .stageMask = asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS}}; + + IQueue::SSubmitInfo submitInfos[1] = {}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = {{.cmdbuf = m_cmdbuf.get()}}; + submitInfos[0].commandBuffers = cmdbufs; + submitInfos[0].signalSemaphores = signals; + + m_api->startCapture(); + m_computeQueue->submit(submitInfos); + const ISemaphore::SWaitInfo waitInfo[] = {{.semaphore = done.get(), .value = 1}}; + m_device->blockForSemaphores(waitInfo); + m_api->endCapture(); + + const float64_t elapsed_ps = float64_t(calcTimeElapsed()) * m_timestampPeriodNs * 1000.0; + + const uint64_t totalOps = uint64_t(Dispatches) * totalThreads * uint64_t(m_BenchmarkSampleCount); + const float64_t ps_per_op = elapsed_ps / float64_t(totalOps); + const float64_t gops_per_s = float64_t(totalOps) / elapsed_ps * 1e3; // ops / (ps × 1e-12) / 1e9 + const float64_t elapsed_ms = elapsed_ps * 1e-9; + + char modeBuf[16]; + if (mode & FLAG_CREATE_ONLY) + snprintf(modeBuf, sizeof(modeBuf), "create-only"); + else + snprintf(modeBuf, sizeof(modeBuf), "1:%u", samplesPerCreation); + + m_logger->log("%-29s | %-12s | %9.2f | %10.2f | %10.3f", ILogger::ELL_PERFORMANCE, name, modeBuf, ps_per_op, gops_per_s, elapsed_ms); + } + + void recordCmdBuff(SAMPLING_MODE_FLAGS mode) const + { + m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_cmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); + m_cmdbuf->beginDebugMarker("sampling compute dispatch", vectorSIMDf(0, 1, 0, 1)); + m_cmdbuf->bindComputePipeline(m_pipelines[denseIdOf(mode)].get()); + m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); + m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, 
sizeof(BenchmarkPushConstants), &m_pushConstants); + + // Serialize back-to-back dispatches so each completes before the next begins + // (matches the original semaphore-chain methodology — measurement is per-dispatch + // time, not pipelined throughput). + const asset::SMemoryBarrier serializeDispatch = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + }; + const IGPUCommandBuffer::SPipelineBarrierDependencyInfo barrierInfo = {.memBarriers = {&serializeDispatch, 1}}; + + for (int i = 0; i < WarmupDispatches; ++i) + { + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + m_cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, barrierInfo); + } + + m_cmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); + + for (int i = 0; i < Dispatches; ++i) + { + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + if (i + 1 < Dispatches) + m_cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, barrierInfo); + } + + m_cmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + m_cmdbuf->endDebugMarker(); + m_cmdbuf->end(); + } + + uint64_t calcTimeElapsed() const + { + uint64_t timestamps[2]; + const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); + m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, ×tamps, sizeof(uint64_t), flags); + return timestamps[1] - timestamps[0]; + } + + private: + core::smart_refctd_ptr m_api; + smart_refctd_ptr m_device; + smart_refctd_ptr m_logger; + SolidAngleVisualizer* m_visualizer; + + nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; + smart_refctd_ptr m_cmdpool = nullptr; + smart_refctd_ptr m_cmdbuf = nullptr; + smart_refctd_ptr m_ds = nullptr; + smart_refctd_ptr m_pplnLayout = nullptr; + 
BenchmarkPushConstants m_pushConstants; + smart_refctd_ptr m_pipelines[SAMPLING_MODE_FLAGS::Count]; + + smart_refctd_ptr m_queryPool = nullptr; + + std::string m_pipelineReports[SAMPLING_MODE_FLAGS::Count]; + const char* m_pipelineReportNames[SAMPLING_MODE_FLAGS::Count] = {}; + + uint32_t m_queueFamily; + IQueue* m_computeQueue; + const nbl::video::IPhysicalDevice* m_physicalDevice = nullptr; + float64_t m_timestampPeriodNs = 1.0; + static constexpr int WarmupDispatches = 100; + static constexpr int Dispatches = 1000; + // PC sampling needs sustained execution per range; one dispatch is too short. Tune up if NSight still reports too few samples. + static constexpr int NSightDispatchesPerMode = 16; + }; + + template + inline bool logFail(const char* msg, Args&&... args) + { + m_logger->log(msg, ILogger::ELL_ERROR, std::forward(args)...); + return false; + } + + std::ofstream m_logFile; +}; + +NBL_MAIN_FUNC(SolidAngleVisualizer) \ No newline at end of file diff --git a/73_SolidAngleVisualizer/pipeline.groovy b/73_SolidAngleVisualizer/pipeline.groovy new file mode 100644 index 000000000..7b7c9702a --- /dev/null +++ b/73_SolidAngleVisualizer/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CUIBuilder extends IBuilder +{ + public CUIBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 
-v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CUIBuilder(_agent, _info) +} + +return this \ No newline at end of file diff --git a/73_SolidAngleVisualizer/src/transform.cpp b/73_SolidAngleVisualizer/src/transform.cpp new file mode 100644 index 000000000..e69de29bb diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c0695775..fbfc7c9cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,6 +105,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(71_RayTracingPipeline) add_subdirectory(72_CooperativeBinarySearch) add_subdirectory(73_ImageUploadBenchmark) + add_subdirectory(73_SolidAngleVisualizer) if (NBL_BUILD_MITSUBA_LOADER) add_subdirectory(73_GeometryInspector) diff --git a/common/include/nbl/examples/Benchmark/BenchmarkCli.h b/common/include/nbl/examples/Benchmark/BenchmarkCli.h new file mode 100644 index 000000000..abb0912da --- /dev/null +++ b/common/include/nbl/examples/Benchmark/BenchmarkCli.h @@ -0,0 +1,125 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_COMMON_BENCHMARK_CLI_INCLUDED_ +#define _NBL_COMMON_BENCHMARK_CLI_INCLUDED_ + +#include +#include "nbl/examples/Benchmark/BenchmarkTypes.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace benchmark_cli +{ + +struct ParsedArgs +{ + std::string outputPath; + bool noBaseline = false; + bool noColor = false; + bool helpRequested = false; + std::vector> baselines; // (label, path) + nbl::core::vector> focus; + // Median-of-K window count used for focused rows (see + // IBenchmark::samplesForCurrentRow). Default 3 trades 3 * targetBudgetMs + // wall time for jitter-robust comparisons. 
+ uint32_t focusSamples = 3; +}; + +// Pure: parse argv into a ParsedArgs. Unknown flags are silently ignored; +// the caller decides what to do on help / no-baseline / per-load failure. +inline ParsedArgs parseArgs(std::span argv, std::string defaultOutputPath) +{ + ParsedArgs out; + out.outputPath = std::move(defaultOutputPath); + + for (size_t i = 1; i < argv.size(); ++i) + { + if (argv[i] == "--output" && i + 1 < argv.size()) + out.outputPath = argv[++i]; + else if (argv[i] == "--no-baseline") + out.noBaseline = true; + else if (argv[i] == "--no-color") + out.noColor = true; + else if (argv[i] == "--baseline" && i + 1 < argv.size()) + { + const std::string& spec = argv[++i]; + const auto eq = spec.find('='); + std::string label, path; + if (eq == std::string::npos) + { + path = spec; + const auto stem = std::filesystem::path(path).stem().string(); + label = stem.empty() ? std::string("baseline") : stem; + } + else + { + label = spec.substr(0, eq); + path = spec.substr(eq + 1); + } + out.baselines.emplace_back(std::move(label), std::move(path)); + } + else if (argv[i] == "--focus" && i + 1 < argv.size()) + { + out.focus.push_back(splitFocusSpec(argv[++i])); + } + else if (argv[i] == "--focus-samples" && i + 1 < argv.size()) + { + // Clamp to [1, 32]: 1 disables the median+outlier path, 32 is well past + // the point of diminishing returns (variance of the trimmed mean drops + // ~1/sqrt(K)). from_chars instead of stol to stay no-exceptions per + // Nabla style; malformed input leaves the default in place. 
+ const std::string& s = argv[++i]; + long v = 0; + const auto [_, ec] = std::from_chars(s.data(), s.data() + s.size(), v); + if (ec == std::errc{}) + out.focusSamples = uint32_t(std::clamp(v, 1, 32)); + } + else if (argv[i] == "--help" || argv[i] == "-h") + { + out.helpRequested = true; + } + } + return out; +} + +inline void printHelp(nbl::system::ILogger* logger, std::string_view appName, std::string_view defaultOutputPath) +{ + benchLogFmt(logger, nbl::system::ILogger::ELL_INFO, + "{} CLI:\n" + " --output PATH write this run's report to PATH (default: {})\n" + " --baseline [LABEL=]PATH load PATH as a baseline; LABEL becomes the column header ('vs LABEL').\n" + " repeatable. If LABEL= is omitted, the file's stem is used\n" + " (e.g. main.json -> 'main'). '=' is used instead of ':' so Windows\n" + " drive letters in paths don't collide with the separator.\n" + " --no-baseline skip the default auto-load of the output path\n" + " --no-color disable ANSI color in the live table (also honored: NO_COLOR=1 env var)\n" + " --focus NAME print a focused baseline-comparison table for NAME before the run.\n" + " NAME is the hierarchical name with '>' between segments (whitespace\n" + " around '>' is optional). Repeatable; one row per --focus. The first\n" + " loaded baseline is the reference for inline deltas in this table.\n" + " Example: --focus \"Linear > Linear > 1:1\"\n" + " --focus-samples N run each focused row N times (median + outlier rejection) for\n" + " jitter-robust comparisons. Default 3; clamped to [1, 32]. N=1\n" + " matches the rest-phase single-shot path. 
Wall time per focused\n" + " row scales linearly with N.\n" + " --help, -h print this help\n" + "\n" + "Default behaviour: with no flags, the prior run's output (if present) is loaded as the single\n" + " 'baseline', and a fresh one is written at the end; iterate-and-compare with no flags needed.\n" + "\n" + "Failed loads (missing/corrupt file) log a warning and continue; the corresponding column reads 'n/a'.", + appName, defaultOutputPath); +} + +} + +#endif diff --git a/common/include/nbl/examples/Benchmark/BenchmarkConsole.h b/common/include/nbl/examples/Benchmark/BenchmarkConsole.h new file mode 100644 index 000000000..e857c36d4 --- /dev/null +++ b/common/include/nbl/examples/Benchmark/BenchmarkConsole.h @@ -0,0 +1,526 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_COMMON_BENCHMARK_CONSOLE_INCLUDED_ +#define _NBL_COMMON_BENCHMARK_CONSOLE_INCLUDED_ + +#include +#include "nbl/examples/Benchmark/BenchmarkTypes.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Methods templated on the baselines range must expose `.label` and `.rowsByName`. 
+class BenchmarkConsole +{ + public: + BenchmarkConsole() + { + // https://no-color.org + if (const char* nc = std::getenv("NO_COLOR"); nc && nc[0] != '\0') + m_useAnsi = false; + } + explicit BenchmarkConsole(nbl::core::smart_refctd_ptr logger) + : BenchmarkConsole() + { + m_logger = std::move(logger); + } + + void setLogger(nbl::core::smart_refctd_ptr logger) { m_logger = std::move(logger); } + nbl::system::ILogger* getLogger() const { return m_logger.get(); } + + void setSilent(bool s) { m_silent = s; } + bool silent() const { return m_silent; } + + void setColorEnabled(bool e) { m_useAnsi = e; } + bool colorEnabled() const { return m_useAnsi; } + + // `neutral` is ELL_PERFORMANCE blue (not a full reset) so uncolored cell + // parts inherit the logger's line-wrap color. Only correct because rows / + // banners are all logged at ELL_PERFORMANCE. + struct Ansi + { + static constexpr std::string_view neutral = "\033[34m"; + static constexpr std::string_view reset = "\033[0m"; + static constexpr std::string_view red = "\033[31m"; + static constexpr std::string_view green = "\033[32m"; + static constexpr std::string_view yellow = "\033[33m"; + static constexpr std::string_view cyan = "\033[36m"; + static constexpr std::string_view bold = "\033[1m"; + }; + + // visualWidth excludes ANSI escape bytes (std::format's `{:>{}}` counts + // bytes), so colored cells must be padded manually via padCell. + struct CellOut + { + std::string text; + size_t visualWidth = 0; + }; + + const Format::Widths& widths() const { return m_widths; } + void growWidthFor(std::string_view joined) { m_widths.grow(joined); } + + // Sizes int columns to unchanged-value width, float columns to "value + // (+/-delta)" with delta=0. Changed-int rows overflow; padding every row + // for worst-case wastes ~40% horizontal space on stable runs. 
+ void growForBaseline(const BaselineRow& b) + { + const auto growInt = [&](size_t& w, uint64_t v) + { + if (v == BaselineRow::kAbsent) + return; + w = std::max(w, std::format("{}", v).size()); + }; + growInt(m_widths.regs, b.registerCount); + growInt(m_widths.code, b.codeSizeBytes); + growInt(m_widths.shared, b.sharedMemBytes); + growInt(m_widths.local, b.privateMemBytes); + + if (b.psPerSample > 0.0) + { + m_widths.psSample = std::max(m_widths.psSample, floatCellPlainText(b.psPerSample, 0.0).size()); + const double gsBase = 1000.0 / b.psPerSample; + m_widths.gsamples = std::max(m_widths.gsamples, floatCellPlainText(gsBase, 0.0).size()); + } + } + + // Pre-register so the header (logged once up front) doesn't stay narrower than later rows. + void registerVariant(std::span name) { m_widths.grow(joinName(name)); } + void registerVariant(std::initializer_list name) + { + std::vector tmp; + tmp.reserve(name.size()); + for (auto s : name) + tmp.emplace_back(s); + m_widths.grow(joinName(tmp)); + } + + void logSectionBanner(std::string_view banner) const + { + if (banner.empty()) + return; + if (m_useAnsi) + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, "{}{}{}{}", Ansi::bold, Ansi::cyan, banner, Ansi::reset); + else + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, "{}", banner); + } + + // Once per session, not per span, otherwise readers see the same text N times. + template + void logBannerNotes(const Baselines& baselines) const + { + if (std::empty(baselines)) + return; + const auto& primary = *std::begin(baselines); + const bool multi = std::distance(std::begin(baselines), std::end(baselines)) > 1; + const std::string primaryLabel = primary.label; + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, + "Note: ps/sample lower = faster; GSamples/s higher = faster. 
Inline annotations compare to primary baseline '{}': " + "floats show 'value (+/-delta)' always; ints show 'old -> new' only when changed.", + primaryLabel); + if (multi) + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, + "Note: trailing 'vs LABEL' columns carry raw ps/sample deltas against secondary baselines (primary skipped, shown inline)."); + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, + "Note: '[WG!]' on a delta = baseline's workload shape (workgroup / dispatch / samplesPerDispatch) differs from this run, comparison is apples-to-oranges."); + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, + "Note: float deltas only get green/red coloring when the relative change is >= {:.0f}% (typical GPU jitter is 1-2%); smaller deltas stay neutral.", + kFloatColorThreshold * 100.0); + } + + template + void logHeader(const Baselines& baselines) const + { + std::string line = std::format("{:<{}} | {:>{}} | {:>{}} | {:>{}} | {:>{}} | {:>{}} | {:>{}}", + "Name", m_widths.name, + "ps/sample", m_widths.psSample, + "GSamples/s", m_widths.gsamples, + "regs", m_widths.regs, + "code(B)", m_widths.code, + "shared(B)", m_widths.shared, + "local(B)", m_widths.local); + // Primary is shown inline on every value column; only secondaries get trailing columns. 
+ size_t idx = 0; + for (const auto& b : baselines) + { + if (idx++ == 0) + continue; + const std::string col = std::format("vs {}", b.label); + line += std::format(" | {:>{}}", col, baselineColWidth(b.label)); + } + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, "{}", line); + } + + template + void logRow(std::span name, std::string_view joinedName, + const TimingResult& t, const PipelineStats& s, + const std::unordered_map& rowBaselines, + const Baselines& baselines) const + { + if (!m_logger || m_silent) + return; + + const BaselineRow* primary = nullptr; + if (!std::empty(baselines)) + { + const std::string key = makeKey(name); + const auto& b0 = *std::begin(baselines); + if (auto it = b0.rowsByName.find(key); it != b0.rowsByName.end()) + primary = &it->second; + } + + // ps_per_sample * GSamples/s == 1000 (see runTimed), so GSamples is derived not stored. + const auto baselineGSamples = primary ? std::optional{primary->psPerSample > 0.0 ? 1000.0 / primary->psPerSample : 0.0} : std::nullopt; + + std::string line = std::format("{:<{}}", joinedName, m_widths.name); + line += " | " + padCell(formatFloatCell(t.ps_per_sample, primary ? std::optional{primary->psPerSample} : std::nullopt, true), m_widths.psSample); + line += " | " + padCell(formatFloatCell(t.gsamples_per_s, baselineGSamples, false), m_widths.gsamples); + line += " | " + padCell(formatIntCell(s.registerCount, primary ? primary->registerCount : BaselineRow::kAbsent), m_widths.regs); + line += " | " + padCell(formatIntCell(s.codeSizeBytes, primary ? primary->codeSizeBytes : BaselineRow::kAbsent), m_widths.code); + line += " | " + padCell(formatIntCell(s.sharedMemBytes, primary ? primary->sharedMemBytes : BaselineRow::kAbsent), m_widths.shared); + line += " | " + padCell(formatIntCell(s.privateMemBytes, primary ? 
primary->privateMemBytes : BaselineRow::kAbsent), m_widths.local); + + size_t idx = 0; + for (const auto& b : baselines) + { + if (idx++ == 0) + continue; + std::string plain; + bool better = false; + bool significant = false; + bool haveValue = false; + bool flagShape = false; + if (auto it = rowBaselines.find(b.label); it != rowBaselines.end() && it->second.psPerSample > 0.0) + { + const double delta = t.ps_per_sample - it->second.psPerSample; + plain = std::format("{:+.3f}", delta); + better = delta < 0.0; + significant = std::abs(delta) / it->second.psPerSample >= kFloatColorThreshold; + haveValue = true; + flagShape = it->second.shapeMismatch; + } + else + { + plain = "n/a"; + } + std::string suffix = flagShape ? std::string(" [WG!]") : std::string(); + CellOut cell; + cell.visualWidth = plain.size() + suffix.size(); + if (!m_useAnsi) + { + cell.text = plain + suffix; + } + else + { + const bool paint = haveValue && significant; + const std::string_view col = paint ? (better ? Ansi::green : Ansi::red) : std::string_view{}; + std::string coloredPlain = paint + ? std::format("{}{}{}", col, plain, Ansi::neutral) + : plain; + std::string coloredSuffix = flagShape + ? 
std::format("{}{}{}{}", Ansi::bold, Ansi::red, suffix, Ansi::neutral) + : std::string(); + cell.text = coloredPlain + coloredSuffix; + } + line += " | " + padCell(cell, baselineColWidth(b.label)); + } + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, "{}", line); + } + + // Flat table, one row per (variant, stat); each baseline gets one delta column: + // + // Name | stat | current | vs iter47 | vs iter48 + // X | ps/sample | 2.151 | -0.044 | +0.123 + // X | GSamples/s | 464.9 | +9.456 | -7.234 + // X | regs | 40 | +0 | +0 + // X | code(B) | 4992 | +128 | 0 + template + void printBaselineComparison(std::span> names, + const Baselines& baselines, const Results& results) const + { + if (!m_logger || names.empty()) + return; + if (std::empty(baselines)) + { + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_WARNING, + "--focus requested {} variant(s) but no baselines are loaded, nothing to compare against. " + "Did your --baseline paths fail to load?", + names.size()); + return; + } + + struct Current + { + TimingResult t; + PipelineStats s; + Workload w; + bool present = false; + }; + std::unordered_map currentByKey; + currentByKey.reserve(std::size(results)); + for (const auto& r : results) + currentByKey[makeKey(r.name)] = {r.timing, r.stats, r.workload, true}; + + const size_t baselineCount = static_cast(std::distance(std::begin(baselines), std::end(baselines))); + + std::vector> rows; + rows.reserve(1 + names.size() * 6); + + { + auto plainCell = [](std::string s) -> CellOut { const size_t w = s.size(); return {std::move(s), w}; }; + std::vector header; + header.reserve(3 + baselineCount); + header.push_back(plainCell("Name")); + header.push_back(plainCell("stat")); + header.push_back(plainCell("current")); + for (const auto& b : baselines) + header.push_back(plainCell(std::format("vs {}", b.label))); + rows.push_back(std::move(header)); + } + + auto floatStatRow = [&](const char* label, std::string_view joined, bool have, double curV, + 
const Workload& curW, const std::string& key, + auto baselineLookup /*BaselineRow -> double*/, bool lowerIsBetter) + { + auto plainCell = [](std::string s) -> CellOut { const size_t w = s.size(); return {std::move(s), w}; }; + std::vector row; + row.reserve(3 + baselineCount); + row.push_back(plainCell(std::string(joined))); + row.push_back(plainCell(label)); + row.push_back(have ? plainCell(formatFloat5(curV)) : plainCell("n/a")); + + for (const auto& b : baselines) + { + auto bit = b.rowsByName.find(key); + if (!have || bit == b.rowsByName.end()) + { + row.push_back(plainCell("n/a")); + continue; + } + const double baseV = baselineLookup(bit->second); + if (baseV <= 0.0) + { + row.push_back(plainCell("n/a")); + continue; + } + const bool shapeMismatch = curW.present() && bit->second.workload.present() && (curW.shape != bit->second.workload.shape); + const double delta = curV - baseV; + const std::string deltaStr = std::format("{}{}", delta >= 0 ? "+" : "-", formatFloat5(std::abs(delta))); + const bool significant = std::abs(delta) / baseV >= kFloatColorThreshold; + const std::string suffix = shapeMismatch ? std::string(" [WG!]") : std::string(); + CellOut cell; + cell.visualWidth = deltaStr.size() + suffix.size(); + if (!m_useAnsi || !significant) + { + cell.text = m_useAnsi && shapeMismatch + ? std::format("{}{}{}{}{}", deltaStr, Ansi::bold, Ansi::red, suffix, Ansi::neutral) + : deltaStr + suffix; + } + else + { + const bool better = (lowerIsBetter && delta < 0.0) || (!lowerIsBetter && delta > 0.0); + const std::string_view col = better ? Ansi::green : Ansi::red; + std::string coloredDelta = std::format("{}{}{}", col, deltaStr, Ansi::neutral); + std::string coloredSuffix = shapeMismatch + ? 
std::format("{}{}{}{}", Ansi::bold, Ansi::red, suffix, Ansi::neutral) + : std::string(); + cell.text = coloredDelta + coloredSuffix; + } + row.push_back(std::move(cell)); + } + rows.push_back(std::move(row)); + }; + + auto intStatRow = [&](const char* label, std::string_view joined, bool have, uint64_t curV, + const Workload& curW, const std::string& key, uint64_t BaselineRow::* baseField) + { + auto plainCell = [](std::string s) -> CellOut { const size_t w = s.size(); return {std::move(s), w}; }; + std::vector row; + row.reserve(3 + baselineCount); + row.push_back(plainCell(std::string(joined))); + row.push_back(plainCell(label)); + row.push_back(have ? plainCell(std::format("{}", curV)) : plainCell("n/a")); + + for (const auto& b : baselines) + { + auto bit = b.rowsByName.find(key); + if (!have || bit == b.rowsByName.end()) + { + row.push_back(plainCell("n/a")); + continue; + } + const uint64_t baseV = bit->second.*baseField; + if (baseV == BaselineRow::kAbsent) + { + row.push_back(plainCell("n/a")); + continue; + } + const bool shapeMismatch = curW.present() && bit->second.workload.present() && (curW.shape != bit->second.workload.shape); + const int64_t delta = int64_t(curV) - int64_t(baseV); + const std::string deltaStr = std::format("{:+d}", delta); + const std::string suffix = shapeMismatch ? std::string(" [WG!]") : std::string(); + CellOut cell; + cell.visualWidth = deltaStr.size() + suffix.size(); + if (!m_useAnsi) + { + cell.text = deltaStr + suffix; + } + else + { + std::string coloredDelta = delta != 0 + ? std::format("{}{}{}", Ansi::yellow, deltaStr, Ansi::neutral) + : deltaStr; + std::string coloredSuffix = shapeMismatch + ? 
std::format("{}{}{}{}", Ansi::bold, Ansi::red, suffix, Ansi::neutral) + : std::string(); + cell.text = coloredDelta + coloredSuffix; + } + row.push_back(std::move(cell)); + } + rows.push_back(std::move(row)); + }; + + for (const auto& nameVec : names) + { + const std::string joined = joinName(nameVec); + const std::string key = makeKey(nameVec); + const auto cit = currentByKey.find(key); + const bool have = (cit != currentByKey.end()) && cit->second.present; + const auto& t = have ? cit->second.t : TimingResult {}; + const auto& s = have ? cit->second.s : PipelineStats {}; + const auto& w = have ? cit->second.w : Workload {}; + + floatStatRow("ps/sample", joined, have, t.ps_per_sample, w, key, + [](const BaselineRow& b) { return b.psPerSample; }, true); + floatStatRow("GSamples/s", joined, have, t.gsamples_per_s, w, key, + [](const BaselineRow& b) { return b.psPerSample > 0.0 ? 1000.0 / b.psPerSample : 0.0; }, false); + intStatRow("regs", joined, have, s.registerCount, w, key, &BaselineRow::registerCount); + intStatRow("code(B)", joined, have, s.codeSizeBytes, w, key, &BaselineRow::codeSizeBytes); + intStatRow("shared(B)", joined, have, s.sharedMemBytes, w, key, &BaselineRow::sharedMemBytes); + intStatRow("local(B)", joined, have, s.privateMemBytes, w, key, &BaselineRow::privateMemBytes); + } + + const size_t nCols = 3 + baselineCount; + std::vector colWidths(nCols, 0); + for (const auto& r : rows) + for (size_t i = 0; i < r.size() && i < nCols; ++i) + colWidths[i] = std::max(colWidths[i], r[i].visualWidth); + + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, + "=== Focus comparison ({} variant(s) vs {} baseline(s); ps/sample lower is better, integer deltas are absolute) ===", + names.size(), baselineCount); + auto leftPad = [](const CellOut& c, size_t targetWidth) -> std::string + { + if (c.visualWidth >= targetWidth) + return c.text; + return c.text + std::string(targetWidth - c.visualWidth, ' '); + }; + for (size_t ri = 0; ri < rows.size(); 
++ri) + { + std::string line; + for (size_t ci = 0; ci < rows[ri].size(); ++ci) + { + if (ci) + line.append(" | "); + if (ci <= 1) + line += leftPad(rows[ri][ci], colWidths[ci]); + else + line += padCell(rows[ri][ci], colWidths[ci]); + } + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_PERFORMANCE, "{}", line); + } + } + + private: + static constexpr size_t kBaselineMinColWidth = 10; + size_t baselineColWidth(std::string_view label) const + { + return std::max(kBaselineMinColWidth, std::string_view("vs ").size() + label.size()); + } + + // Typical GPU jitter is 1-2%; coloring below 5% would mostly highlight noise. + static constexpr double kFloatColorThreshold = 0.05; + + // std::format counts ANSI escape bytes, so `{:>N}` can't pad colored cells. + std::string padCell(const CellOut& c, size_t targetWidth) const + { + if (c.visualWidth >= targetWidth) + return c.text; + return std::string(targetWidth - c.visualWidth, ' ') + c.text; + } + + // "regs 40 -> 54" is more useful than "+14 from somewhere", show both endpoints. + CellOut formatIntCell(uint64_t current, uint64_t baseline) const + { + if (baseline == BaselineRow::kAbsent || baseline == current) + { + auto s = std::format("{}", current); + const size_t w = s.size(); + return {std::move(s), w}; + } + const std::string baseStr = std::format("{}", baseline); + const std::string curStr = std::format("{}", current); + const std::string plain = std::format("{} -> {}", baseStr, curStr); + const size_t visW = plain.size(); + if (!m_useAnsi) + return {plain, visW}; + auto colored = std::format("{}{} -> {}{}", Ansi::yellow, baseStr, curStr, Ansi::neutral); + return {std::move(colored), visW}; + } + + // ~5 chars including the decimal point, so column widths stay predictable + // across ps/sample (0.5..100) and GSamples/s (0.03..1000+). 
+ static std::string formatFloat5(double v) + { + const double mag = std::abs(v); + if (mag >= 10000.0) return std::format("{:.0f}", v); + if (mag >= 1000.0) return std::format("{:.1f}", v); + if (mag >= 100.0) return std::format("{:.1f}", v); + if (mag >= 10.0) return std::format("{:.2f}", v); + return std::format("{:.3f}", v); + } + + static std::string floatCellPlainText(double value, double delta) + { + const std::string deltaStr = std::format("{}{}", delta >= 0 ? "+" : "-", formatFloat5(std::abs(delta))); + return std::format("{} ({})", formatFloat5(value), deltaStr); + } + + CellOut formatFloatCell(double current, std::optional baseline, bool lowerIsBetter) const + { + if (!baseline.has_value() || *baseline <= 0.0) + { + auto s = formatFloat5(current); + const size_t w = s.size(); + return {std::move(s), w}; + } + const double delta = current - *baseline; + const std::string plain = floatCellPlainText(current, delta); + const size_t visW = plain.size(); + const bool significant = std::abs(delta) / *baseline >= kFloatColorThreshold; + if (!m_useAnsi || !significant) + return {plain, visW}; + const std::string valStr = formatFloat5(current); + const std::string deltaStr = std::format("{}{}", delta >= 0 ? "+" : "-", formatFloat5(std::abs(delta))); + const bool better = (lowerIsBetter && delta < 0.0) || (!lowerIsBetter && delta > 0.0); + const std::string_view color = better ? 
Ansi::green : Ansi::red; + auto colored = std::format("{} ({}{}{})", valStr, color, deltaStr, Ansi::neutral); + return {std::move(colored), visW}; + } + + nbl::core::smart_refctd_ptr m_logger; + Format::Widths m_widths; + bool m_silent = false; + bool m_useAnsi = true; +}; + +#endif diff --git a/common/include/nbl/examples/Benchmark/BenchmarkJson.h b/common/include/nbl/examples/Benchmark/BenchmarkJson.h new file mode 100644 index 000000000..e6d3fff24 --- /dev/null +++ b/common/include/nbl/examples/Benchmark/BenchmarkJson.h @@ -0,0 +1,306 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_COMMON_BENCHMARK_JSON_INCLUDED_ +#define _NBL_COMMON_BENCHMARK_JSON_INCLUDED_ + +#include +#include "nbl/examples/Benchmark/BenchmarkTypes.h" +#include "nlohmann/json.hpp" + +#include +#include +#include +#include +#include +#include +#include + +namespace benchmark_json +{ + +// Builds the "device" JSON object from a physical device, or null if dev is null. +inline nlohmann::json buildDeviceMetadata(const nbl::video::IPhysicalDevice* dev) +{ + if (!dev) + return nullptr; + const auto& p = dev->getProperties(); + nlohmann::json out = nlohmann::json::object(); + out["name"] = std::string(p.deviceName); + out["vendorID"] = p.vendorID; + out["deviceID"] = p.deviceID; + out["driverID"] = static_cast(p.driverID); + out["driverName"] = std::string(p.driverName); + out["driverInfo"] = std::string(p.driverInfo); + out["driverVersion"] = p.driverVersion; + out["deviceUUID"] = std::vector(p.deviceUUID, p.deviceUUID + 16); + out["driverUUID"] = std::vector(p.driverUUID, p.driverUUID + 16); + return out; +} + +// Parses a JSON report file into a Baseline. Returns nullopt on missing / +// unparseable / empty file. Caller is responsible for appending / replacing +// in their baseline store and for feeding rows into BenchmarkConsole widths. 
+inline std::optional loadBaselineFile(std::string label, const std::string& path) +{ + std::ifstream f(path); + if (!f.is_open()) + return std::nullopt; + + nlohmann::json j; + try + { + f >> j; + } + catch (const std::exception&) + { + return std::nullopt; + } + + const auto resultsIt = j.find("results"); + if (resultsIt == j.end() || !resultsIt->is_array()) + return std::nullopt; + + std::unordered_map rowsByName; + for (const auto& r : *resultsIt) + { + const auto n = r.find("name"); + const auto ps = r.find("ps_per_sample"); + if (n == r.end() || ps == r.end()) + continue; + if (!n->is_array() || !ps->is_number()) + continue; + std::vector nameVec; + nameVec.reserve(n->size()); + for (const auto& seg : *n) + { + if (!seg.is_string()) + { + nameVec.clear(); + break; + } + nameVec.emplace_back(seg.get()); + } + if (nameVec.empty()) + continue; + + BaselineRow row; + try + { + row.psPerSample = ps->get(); + } + catch (const std::exception&) + { + continue; + } + + auto readU64 = [&](const char* key, uint64_t& out) + { + const auto it = r.find(key); + if (it != r.end() && it->is_number_unsigned()) + out = it->get(); + }; + readU64("regs", row.registerCount); + readU64("code_bytes", row.codeSizeBytes); + readU64("shared_mem_bytes", row.sharedMemBytes); + readU64("local_mem_bytes", row.privateMemBytes); + readU64("stack_bytes", row.stackBytes); + readU64("subgroup_size", row.subgroupSize); + + auto readUvec3 = [&](const char* key, nbl::hlsl::uint32_t3& out) + { + const auto it = r.find(key); + if (it == r.end() || !it->is_array() || it->size() != 3) + return; + const auto& a = *it; + if (!a[0].is_number_unsigned() || !a[1].is_number_unsigned() || !a[2].is_number_unsigned()) + return; + out.x = a[0].get(); + out.y = a[1].get(); + out.z = a[2].get(); + }; + readUvec3("workgroup_size", row.workload.shape.workgroupSize); + readUvec3("dispatch_groups", row.workload.shape.dispatchGroupCount); + readU64("samples_per_dispatch", row.workload.shape.samplesPerDispatch); + if 
(const auto it = r.find("bench_dispatches"); it != r.end() && it->is_number_unsigned()) + row.workload.benchDispatches = it->get(); + + rowsByName[makeKey(nameVec)] = row; + } + if (rowsByName.empty()) + return std::nullopt; + + return Baseline {std::move(label), path, j.contains("device") ? j["device"] : nullptr, std::move(rowsByName)}; +} + +// Writes a JSON report. Preserves rows in the prior file whose names weren't +// re-measured this run, so writeReportFile can be an intermediate checkpoint +// during a multi-bench-class session. Returns preservedCount via out-param. +inline bool writeReportFile(const std::string& path, const nlohmann::json& deviceMetadata, const std::vector& baselines, const std::vector& results, nbl::system::ILogger* logger, size_t* outPreservedCount = nullptr) +{ + nlohmann::json doc; + doc["version"] = 1; + + if (!deviceMetadata.is_null()) + doc["device"] = deviceMetadata; + + if (!baselines.empty()) + { + auto& baselinesNode = doc["baselines"] = nlohmann::json::object(); + for (const auto& b : baselines) + baselinesNode[b.label] = b.path; + } + auto& resultsNode = doc["results"] = nlohmann::json::array(); + + std::unordered_set currentKeys; + currentKeys.reserve(results.size()); + for (const auto& r : results) + currentKeys.insert(makeKey(r.name)); + + for (const auto& r : results) + { + nlohmann::json row; + row["name"] = r.name; + row["ps_per_sample"] = r.timing.ps_per_sample; + row["gsamples_per_s"] = r.timing.gsamples_per_s; + row["ms_total"] = r.timing.ms_total; + row["regs"] = r.stats.registerCount; + row["code_bytes"] = r.stats.codeSizeBytes; + row["shared_mem_bytes"] = r.stats.sharedMemBytes; + row["local_mem_bytes"] = r.stats.privateMemBytes; + row["stack_bytes"] = r.stats.stackBytes; + row["subgroup_size"] = r.stats.subgroupSize; + + // Structured so JSON preserves the exact numeric type. 
+ if (!r.stats.unknowns.empty()) + { + using F = nbl::video::IGPUPipelineBase::SExecutableStatistic::FORMAT; + auto& arr = row["unknown_stats"] = nlohmann::json::array(); + for (const auto& s : r.stats.unknowns) + { + nlohmann::json entry; + entry["name"] = s.name; + switch (s.format) + { + case F::BOOL32: + entry["type"] = "bool"; + entry["value"] = s.value.b32; + break; + case F::INT64: + entry["type"] = "int"; + entry["value"] = s.value.i64; + break; + case F::UINT64: + entry["type"] = "uint"; + entry["value"] = s.value.u64; + break; + case F::FLOAT64: + entry["type"] = "float"; + entry["value"] = s.value.f64; + break; + } + arr.push_back(std::move(entry)); + } + } + + row["workgroup_size"] = {r.workload.shape.workgroupSize.x, r.workload.shape.workgroupSize.y, r.workload.shape.workgroupSize.z}; + row["dispatch_groups"] = {r.workload.shape.dispatchGroupCount.x, r.workload.shape.dispatchGroupCount.y, r.workload.shape.dispatchGroupCount.z}; + row["samples_per_dispatch"] = r.workload.shape.samplesPerDispatch; + row["bench_dispatches"] = r.workload.benchDispatches; + + resultsNode.push_back(std::move(row)); + } + + // Caveat: renamed/removed variants linger forever. Delete the output JSON + // to get a clean slate. 
+ size_t preservedCount = 0; + { + std::ifstream in(path); + if (in.is_open()) + { + nlohmann::json existing; + try + { + in >> existing; + } + catch (const std::exception&) + { + existing = nullptr; + } + const auto rIt = existing.find("results"); + if (rIt != existing.end() && rIt->is_array()) + { + for (const auto& priorRow : *rIt) + { + const auto n = priorRow.find("name"); + if (n == priorRow.end() || !n->is_array()) + continue; + std::vector nameVec; + bool ok = true; + for (const auto& seg : *n) + { + if (!seg.is_string()) + { + ok = false; + break; + } + nameVec.emplace_back(seg.get()); + } + if (!ok || nameVec.empty()) + continue; + if (currentKeys.find(makeKey(nameVec)) != currentKeys.end()) + continue; // re-measured this run + + resultsNode.push_back(priorRow); + ++preservedCount; + } + } + } + } + + std::ofstream f(path, std::ios::out | std::ios::trunc); + if (!f.is_open()) + { + benchLogFmt(logger, nbl::system::ILogger::ELL_ERROR, "benchmark_json::writeReportFile: failed to open '{}'", path); + return false; + } + + // One result per line keeps `git diff` showing one row per change instead + // of N lines per row. + f << "{\n"; + f << " \"version\": " << doc["version"].dump() << ",\n"; + if (doc.contains("device")) + { + // Compact value render so byte arrays (deviceUUID etc.) stay inline. + const auto& dev = doc["device"]; + f << " \"device\": {\n"; + bool first = true; + for (auto it = dev.begin(); it != dev.end(); ++it) + { + if (!first) + f << ",\n"; + first = false; + f << " \"" << it.key() << "\": " << it.value().dump(); + } + f << "\n },\n"; + } + if (doc.contains("baselines")) + f << " \"baselines\": " << doc["baselines"].dump() << ",\n"; + f << " \"results\": ["; + for (size_t i = 0; i < resultsNode.size(); ++i) + { + f << (i ? ",\n " : "\n "); + f << resultsNode[i].dump(); + } + f << (resultsNode.empty() ? 
"]\n" : "\n ]\n"); + f << "}\n"; + + if (outPreservedCount) + *outPreservedCount = preservedCount; + return true; +} + +} // namespace benchmark_json + +#endif diff --git a/common/include/nbl/examples/Benchmark/BenchmarkTypes.h b/common/include/nbl/examples/Benchmark/BenchmarkTypes.h new file mode 100644 index 000000000..274c19514 --- /dev/null +++ b/common/include/nbl/examples/Benchmark/BenchmarkTypes.h @@ -0,0 +1,211 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_COMMON_BENCHMARK_TYPES_INCLUDED_ +#define _NBL_COMMON_BENCHMARK_TYPES_INCLUDED_ + +#include +#include "nlohmann/json.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +struct PipelineStats +{ + uint64_t registerCount = 0; + uint64_t codeSizeBytes = 0; + uint64_t sharedMemBytes = 0; + uint64_t privateMemBytes = 0; + uint64_t stackBytes = 0; + uint32_t subgroupSize = 0; + std::string raw; + + // Driver stats matchStat didn't recognise. Structured (not lossy-stringified + // into `raw`) so JSON round-trips the correct numeric type. 
+ std::vector unknowns; +}; + +struct TimingResult +{ + float64_t elapsed_ns = 0.0; + uint64_t totalSamples = 0; + float64_t ps_per_sample = 0.0; + float64_t gsamples_per_s = 0.0; + float64_t ms_total = 0.0; +}; + +struct Format +{ + struct Widths + { + size_t name = std::string_view("Name").size(); + size_t psSample = std::string_view("ps/sample").size(); + size_t gsamples = std::string_view("GSamples/s").size(); + size_t regs = std::string_view("regs").size(); + size_t code = std::string_view("code(B)").size(); + size_t shared = std::string_view("shared(B)").size(); + size_t local = std::string_view("local(B)").size(); + + void grow(std::string_view joinedName) { name = std::max(name, joinedName.size()); } + }; + + static std::string headerBase(const Widths& w = {}) + { + return std::format("{:<{}} | {:>12} | {:>12} | {:>6} | {:>8} | {:>12} | {:>12}", + "Name", w.name, "ps/sample", "GSamples/s", "regs", "code(B)", "shared(B)", "local(B)"); + } + + static std::string dataBase(const Widths& w, std::string_view joinedName, const TimingResult& t, const PipelineStats& s) + { + return std::format("{:<{}} | {:>12.3f} | {:>12.3f} | {:>6} | {:>8} | {:>12} | {:>12}", + joinedName, w.name, t.ps_per_sample, t.gsamples_per_s, s.registerCount, s.codeSizeBytes, s.sharedMemBytes, s.privateMemBytes); + } +}; + +// The "what was measured" part of a workload. Workload (adds benchDispatches) +// and RunContext (adds banner label + budget) both embed a WorkloadShape, so +// the shape can be sliced into either from the other. 
+struct WorkloadShape +{ + nbl::hlsl::uint32_t3 workgroupSize = {0, 0, 0}; + nbl::hlsl::uint32_t3 dispatchGroupCount = {0, 0, 0}; + uint64_t samplesPerDispatch = 0; + + inline bool operator==(const WorkloadShape& other) const + { + return workgroupSize == other.workgroupSize && dispatchGroupCount == other.dispatchGroupCount && samplesPerDispatch == other.samplesPerDispatch; + } + + inline bool operator!=(const WorkloadShape& other) const + { + return !(*this == other); + } +}; + +struct Workload +{ + WorkloadShape shape; + uint32_t benchDispatches = 0; + + // Default-constructed (all zeros) signals "not recorded". + bool present() const { return shape.samplesPerDispatch != 0; } +}; + +struct BaselineRow +{ + // UINT64_MAX sentinel: no real pipeline stat reaches that magnitude, so an + // "absent" field can't collide with a real value. The current run can also + // produce kAbsent when a driver doesn't expose a given stat. + static constexpr uint64_t kAbsent = std::numeric_limits::max(); + + float64_t psPerSample = 0.0; + uint64_t registerCount = kAbsent; + uint64_t codeSizeBytes = kAbsent; + uint64_t sharedMemBytes = kAbsent; + uint64_t privateMemBytes = kAbsent; + uint64_t stackBytes = kAbsent; + uint64_t subgroupSize = kAbsent; // uint64_t (not 32) to share kAbsent semantics + Workload workload {}; +}; + +// Per-baseline reference for a single row: the baseline's ps/sample plus +// whether its recorded workload shape differs from this run (renders the +// "[WG!]" marker so the reader knows the comparison is questionable). +struct BaselineRef +{ + float64_t psPerSample = 0.0; + bool shapeMismatch = false; +}; + +struct Result +{ + // Hierarchical name, outermost first. Tooling can group by any prefix; the + // console joins with " > ". 
+ nbl::core::vector name; + TimingResult timing {}; + PipelineStats stats {}; + Workload workload {}; + std::unordered_map baselines; +}; + +inline std::string joinName(std::span name, std::string_view sep = " > ") +{ + std::string out; + for (size_t i = 0; i < name.size(); ++i) + { + if (i) + out.append(sep); + out.append(name[i]); + } + return out; +} + +// Unit-separator (\x1f) between segments so makeKey can't collide with any +// user-supplied content. +inline std::string makeKey(std::span name) +{ + std::string k; + size_t total = 0; + for (const auto& s : name) + total += s.size() + 1; + k.reserve(total); + for (size_t i = 0; i < name.size(); ++i) + { + if (i) + k.push_back('\x1f'); + k.append(name[i]); + } + return k; +} + +inline nbl::core::vector splitFocusSpec(std::string_view spec) +{ + auto trim = [](std::string_view s) + { + while (!s.empty() && (s.front() == ' ' || s.front() == '\t')) + s.remove_prefix(1); + while (!s.empty() && (s.back() == ' ' || s.back() == '\t')) + s.remove_suffix(1); + return s; + }; + nbl::core::vector out; + size_t start = 0; + while (start <= spec.size()) + { + size_t end = spec.find('>', start); + if (end == std::string_view::npos) + end = spec.size(); + const auto seg = trim(spec.substr(start, end - start)); + if (!seg.empty()) + out.emplace_back(seg); + if (end == spec.size()) + break; + start = end + 1; + } + return out; +} + +struct Baseline +{ + std::string label; + std::string path; + nlohmann::json device; // top-level "device" field from the file, or null if absent + std::unordered_map rowsByName; // makeKey(name) -> stats +}; + +template +inline void benchLogFmt(nbl::system::ILogger* logger, nbl::system::ILogger::E_LOG_LEVEL level, std::string_view fmt, const Args&... 
args) +{ + if (!logger) + return; + logger->log("%s", level, std::vformat(fmt, std::make_format_args(args...)).c_str()); +} + +#endif diff --git a/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h b/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h new file mode 100644 index 000000000..553e5a21b --- /dev/null +++ b/common/include/nbl/examples/Benchmark/GPUBenchmarkHelper.h @@ -0,0 +1,784 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_COMMON_GPU_BENCHMARK_HELPER_INCLUDED_ +#define _NBL_COMMON_GPU_BENCHMARK_HELPER_INCLUDED_ + +#include +#include "nbl/examples/examples.hpp" +#include "nbl/examples/Benchmark/BenchmarkTypes.h" +#include "nbl/asset/utils/CCompilerSet.h" +#include "nbl/asset/utils/IShaderCompiler.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class GPUBenchmarkHelper +{ +public: + struct InitData + { + nbl::core::smart_refctd_ptr device; + nbl::core::smart_refctd_ptr logger; + nbl::video::IPhysicalDevice* physicalDevice = nullptr; + uint32_t computeFamilyIndex = 0; + nbl::hlsl::uint32_t3 dispatchGroupCount = {0, 0, 0}; + uint64_t samplesPerDispatch = 0; + }; + + // One shader source for a benchmark variant. Picks ONE of two paths: + // * Precompiled: `precompiledKey` is a SPIRV asset key from CMake-time + // NBL_CREATE_NSC_COMPILE_RULES. `defines` is ignored. + // * Runtime: `sourcePath` is an .hlsl file resolved against "app_resources", + // compiled at load time with `defines` as -D macros. Use this for fast + // variant iteration without reconfiguring CMake. + struct ShaderVariant + { + // SMacroDefinition uses string_view; this struct owns the backing strings. 
+ struct Define + { + std::string identifier; + std::string definition; + }; + + std::string sourcePath; + std::string precompiledKey; + std::vector defines; + nbl::asset::IShader::E_SHADER_STAGE stage = nbl::asset::IShader::E_SHADER_STAGE::ESS_COMPUTE; + + static ShaderVariant Precompiled(std::string key) + { + ShaderVariant v; + v.precompiledKey = std::move(key); + return v; + } + static ShaderVariant FromSource(std::string path, std::vector defs = {}, nbl::asset::IShader::E_SHADER_STAGE stage = nbl::asset::IShader::E_SHADER_STAGE::ESS_COMPUTE) + { + ShaderVariant v; + v.sourcePath = std::move(path); + v.defines = std::move(defs); + v.stage = stage; + return v; + } + + bool isRuntime() const { return !sourcePath.empty() && precompiledKey.empty(); } + bool isPrecompiled() const { return !precompiledKey.empty(); } + }; + + // Logical layout: [warmup x dispatchOne][ts0][bench x dispatchOne][ts1][cooldown x dispatchOne] + // Warmup/cooldown can be split into shorter submissions and the measured window stays intact. + // Putting binds inside dispatchOne adds per-iteration cmdbuf overhead that + // shows up in ps/sample on tight shaders. + using DispatchFn = std::function; + + // Input choice for createBindings(). Output is always implicit BDA. 
+ enum class InputBuffer : uint8_t + { + None, + BDA, + SSBO, + UBO, + }; + + struct BindingsConfig + { + size_t outputBytes = 0; + size_t pushConstantBytes = 0; + size_t inputBytes = 0; + InputBuffer inputMode = InputBuffer::None; + }; + + struct Bindings + { + nbl::core::smart_refctd_ptr outputBuf; + uint64_t outputAddress = 0; + nbl::core::smart_refctd_ptr pipelineLayout; + + nbl::core::smart_refctd_ptr inputBuf; + uint64_t inputAddress = 0; // BDA mode only + + nbl::core::smart_refctd_ptr dsLayout; + nbl::core::smart_refctd_ptr ds; + }; + + struct PipelineEntry + { + nbl::core::smart_refctd_ptr pipeline; + nbl::core::smart_refctd_ptr layout; + PipelineStats stats; + std::string tag; + }; + + // Common bindOnce body: bind pipeline + upload push constants. Most benches + // have nothing else in bindOnce; the few that bind descriptor sets too call + // cb->bindDescriptorSets() before/after this. + template + static void defaultBindAndPush(nbl::video::IGPUCommandBuffer* cb, const PipelineEntry& pe, const PC& pc) + { + cb->bindComputePipeline(pe.pipeline.get()); + cb->pushConstants(pe.layout.get(), nbl::asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(PC), &pc); + } + + // Dispatch using m_dispatchGroupCount (the setup-time shape). 
+ void defaultDispatch(nbl::video::IGPUCommandBuffer* cb) const + { + cb->dispatch(m_dispatchGroupCount.x, m_dispatchGroupCount.y, m_dispatchGroupCount.z); + } + + bool init(const InitData& data) + { + m_device = data.device; + m_logger = data.logger; + m_physicalDevice = data.physicalDevice; + m_queue = m_device->getQueue(data.computeFamilyIndex, 0); + m_dispatchGroupCount = data.dispatchGroupCount; + m_samplesPerDispatch = data.samplesPerDispatch; + + m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, + nbl::video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!m_cmdpool->createCommandBuffers(nbl::video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) + { + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_ERROR, "GPUBenchmarkHelper: failed to create cmdbuf"); + return false; + } + + nbl::video::IQueryPool::SCreationParams qparams = {}; + qparams.queryType = nbl::video::IQueryPool::TYPE::TIMESTAMP; + qparams.queryCount = 2; + qparams.pipelineStatisticsFlags = nbl::video::IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + m_queryPool = m_device->createQueryPool(qparams); + if (!m_queryPool) + { + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_ERROR, "GPUBenchmarkHelper: failed to create timestamp query pool"); + return false; + } + return true; + } + + // Load (precompiled path) or load+compile (runtime path) a variant's SPIRV. 
+ nbl::core::smart_refctd_ptr loadShader(const ShaderVariant& variant, nbl::core::smart_refctd_ptr assetMgr) const + { + using namespace nbl; + if (!variant.isRuntime() && !variant.isPrecompiled()) + { + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "GPUBenchmarkHelper::loadShader: variant has neither sourcePath nor precompiledKey"); + return nullptr; + } + + asset::IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + + std::string key; + if (variant.isPrecompiled()) + { + lp.workingDirectory = "app_resources"; + key = variant.precompiledKey; + } + else + { + lp.workingDirectory = ""; + key = "app_resources/" + variant.sourcePath; + } + auto bundle = assetMgr->getAsset(key, lp); + const auto assets = bundle.getContents(); + if (assets.empty()) + { + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "GPUBenchmarkHelper::loadShader: failed to load '{}'", key); + return nullptr; + } + auto source = asset::IAsset::castDown(assets[0]); + if (!source) + { + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "GPUBenchmarkHelper::loadShader: '{}' is not an IShader asset", key); + return nullptr; + } + + if (variant.isPrecompiled()) + return source; + + auto* compilerSet = assetMgr->getCompilerSet(); + auto compiler = compilerSet->getShaderCompiler(source->getContentType()); + if (!compiler) + { + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "GPUBenchmarkHelper::loadShader: no compiler for content type of '{}'", variant.sourcePath); + return nullptr; + } + + std::vector wireDefines; + wireDefines.reserve(variant.defines.size()); + for (const auto& d : variant.defines) + wireDefines.push_back({d.identifier, d.definition}); + + asset::IShaderCompiler::SCompilerOptions options = {}; + options.stage = variant.stage; + options.preprocessorOptions.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; + options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); + 
options.preprocessorOptions.logger = m_logger.get(); + options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder(); + options.preprocessorOptions.extraDefines = {wireDefines.data(), wireDefines.size()}; + + auto spirv = compilerSet->compileToSPIRV(source.get(), options); + if (!spirv) + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "GPUBenchmarkHelper::loadShader: runtime compile failed for '{}'", variant.sourcePath); + return spirv; + } + + nbl::core::smart_refctd_ptr allocateDeviceLocalBuffer(nbl::video::IGPUBuffer::SCreationParams bp, const char* label, + nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS allocFlags = nbl::video::IDeviceMemoryAllocation::EMAF_NONE) + { + auto buf = m_device->createBuffer(std::move(bp)); + auto reqs = buf->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits(); + auto alloc = m_device->allocate(reqs, buf.get(), allocFlags); + if (!alloc.isValid()) + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_ERROR, "GPUBenchmarkHelper: failed to allocate {}", label); + return buf; + } + + struct SingleBindingDS + { + nbl::core::smart_refctd_ptr layout; + nbl::core::smart_refctd_ptr set; + }; + + SingleBindingDS createSingleBindingDS( + nbl::core::smart_refctd_ptr buffer, + nbl::asset::IDescriptor::E_TYPE type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + uint32_t binding = 0, + nbl::hlsl::ShaderStage stages = nbl::hlsl::ShaderStage::ESS_COMPUTE) + { + using namespace nbl; + const size_t bufferBytes = buffer->getSize(); + + video::IGPUDescriptorSetLayout::SBinding b = { + .binding = binding, + .type = type, + .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = stages, + .count = 1, + }; + SingleBindingDS out; + out.layout = m_device->createDescriptorSetLayout({&b, 1}); + auto pool = m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, {&out.layout.get(), 1}); + out.set = 
pool->createDescriptorSet(core::smart_refctd_ptr(out.layout)); + + video::IGPUDescriptorSet::SDescriptorInfo info = {}; + info.desc = std::move(buffer); + info.info.buffer = {.offset = 0, .size = bufferBytes}; + video::IGPUDescriptorSet::SWriteDescriptorSet w = { + .dstSet = out.set.get(), + .binding = binding, + .arrayElement = 0, + .count = 1, + .info = &info, + }; + m_device->updateDescriptorSets({&w, 1}, {}); + return out; + } + + nbl::core::smart_refctd_ptr createOutputBuffer( + size_t bytes, + nbl::core::bitflag extraUsage = nbl::video::IGPUBuffer::E_USAGE_FLAGS::EUF_NONE, + nbl::video::IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS allocFlags = nbl::video::IDeviceMemoryAllocation::EMAF_NONE) + { + nbl::video::IGPUBuffer::SCreationParams bp = {}; + bp.size = bytes; + bp.usage = nbl::core::bitflag(nbl::video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | extraUsage; + return allocateDeviceLocalBuffer(std::move(bp), "output buffer", allocFlags); + } + + // Buffer must have been created with EUF_TRANSFER_DST_BIT. 
+ void submitFillZero(nbl::core::smart_refctd_ptr buf, size_t bytes) const + { + nbl::core::smart_refctd_ptr initCmdbuf; + m_cmdpool->createCommandBuffers(nbl::video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &initCmdbuf); + initCmdbuf->begin(nbl::video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + const nbl::asset::SBufferRange range = {.offset = 0, .size = bytes, .buffer = std::move(buf)}; + initCmdbuf->fillBuffer(range, 0u); + initCmdbuf->end(); + + const nbl::video::IQueue::SSubmitInfo::SCommandBufferInfo cmds[] = {{.cmdbuf = initCmdbuf.get()}}; + nbl::video::IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = cmds; + m_queue->submit({&submit, 1u}); + m_device->waitIdle(); + } + + nbl::core::smart_refctd_ptr createInputBufferZeroFilled(size_t bytes) + { + auto buf = createOutputBuffer(bytes, nbl::video::IGPUBuffer::EUF_TRANSFER_DST_BIT); + if (buf) + submitFillZero(buf, bytes); + return buf; + } + + // BDA buffer staged into device-local VRAM via IUtilities. + nbl::core::smart_refctd_ptr createBdaBuffer(const void* srcData, size_t bytes) + { + using namespace nbl; + if (!m_utils) + m_utils = video::IUtilities::create(core::smart_refctd_ptr(m_device), core::smart_refctd_ptr(m_logger)); + + video::IGPUBuffer::SCreationParams bp = {}; + bp.size = bytes; + bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | video::IGPUBuffer::EUF_TRANSFER_DST_BIT; + core::smart_refctd_ptr buf; + auto future = m_utils->createFilledDeviceLocalBufferOnDedMem( + video::SIntendedSubmitInfo {.queue = m_queue}, std::move(bp), srcData); + future.move_into(buf); + return buf; + } + + uint32_t createPipeline(const ShaderVariant& variant, + nbl::core::smart_refctd_ptr assetMgr, + size_t pushConstantSize, + std::string tag = "", + nbl::core::smart_refctd_ptr dsLayout = nullptr) + { + using namespace nbl; + PipelineEntry slot = {.tag = tag}; + + const asset::SPushConstantRange pcRange = { + .stageFlags = 
asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, + .offset = 0, + .size = uint32_t(pushConstantSize), + }; + auto layout = dsLayout + ? m_device->createPipelineLayout({&pcRange, 1}, core::smart_refctd_ptr(dsLayout)) + : m_device->createPipelineLayout({&pcRange, 1}); + if (!layout) + { + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "createPipeline({}): pipeline layout creation failed", tag); + return InvalidPipelineIndex; + } + + auto source = loadShader(variant, std::move(assetMgr)); + auto shader = source ? m_device->compileShader({.source = source.get()}) : nullptr; + if (!shader) + { + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "createPipeline({}): shader load/compile failed", tag); + return InvalidPipelineIndex; + } + + video::IGPUComputePipeline::SCreationParams pp = {}; + pp.layout = layout.get(); + pp.shader.shader = shader.get(); + pp.shader.entryPoint = "main"; + if (m_device->getEnabledFeatures().pipelineExecutableInfo) + pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; + + core::smart_refctd_ptr pipeline; + if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &pipeline) || !pipeline) + { + benchLogFmt(m_logger.get(), system::ILogger::ELL_ERROR, "createPipeline({}): createComputePipelines failed", tag); + return InvalidPipelineIndex; + } + + if (m_device->getEnabledFeatures().pipelineExecutableInfo) + { + auto infos = pipeline->getExecutableInfo(); + slot.stats.raw = nbl::system::to_string(infos); + + uint64_t vgpr = 0, sgpr = 0; + for (const auto& info : infos) + { + if (info.subgroupSize) + slot.stats.subgroupSize = std::max(slot.stats.subgroupSize, info.subgroupSize); + for (const auto& stat : info.structuredStatistics) + matchStat(stat, slot.stats, vgpr, sgpr); + } + // AMD-style drivers expose VGPR/SGPR separately without a combined + // register count, so fall back to the sum. 
+ if (slot.stats.registerCount == 0 && (vgpr || sgpr)) + slot.stats.registerCount = vgpr + sgpr; + + if (!slot.stats.raw.empty()) + benchLogFmt(m_logger.get(), system::ILogger::ELL_PERFORMANCE, "{} pipeline executable report:\n{}", tag, slot.stats.raw); + } + + slot.layout = std::move(layout); + slot.pipeline = std::move(pipeline); + const uint32_t idx = uint32_t(m_pipelines.size()); + m_pipelines.push_back(std::move(slot)); + return idx; + } + + Bindings createBindings(const BindingsConfig& cfg) + { + using namespace nbl; + Bindings out; + + out.outputBuf = createOutputBuffer(cfg.outputBytes, video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT, video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + out.outputAddress = out.outputBuf->getDeviceAddress(); + + if (cfg.inputMode != InputBuffer::None && cfg.inputBytes > 0) + { + const bool useBDA = cfg.inputMode == InputBuffer::BDA; + const bool useUBO = cfg.inputMode == InputBuffer::UBO; + const bool useSSBO = cfg.inputMode == InputBuffer::SSBO; + + video::IGPUBuffer::SCreationParams bp = {}; + bp.size = cfg.inputBytes; + bp.usage = core::bitflag(video::IGPUBuffer::EUF_TRANSFER_DST_BIT); + if (useBDA || useSSBO) + bp.usage |= video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + if (useBDA) + bp.usage |= video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + if (useUBO) + bp.usage |= video::IGPUBuffer::EUF_UNIFORM_BUFFER_BIT; + + out.inputBuf = allocateDeviceLocalBuffer(std::move(bp), "input buffer", + useBDA ? video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT : video::IDeviceMemoryAllocation::EMAF_NONE); + + if (useBDA) + out.inputAddress = out.inputBuf->getDeviceAddress(); + + submitFillZero(out.inputBuf, cfg.inputBytes); + + if (useSSBO || useUBO) + { + video::IGPUDescriptorSetLayout::SBinding b = { + .binding = 0, + .type = useSSBO ? 
asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER : asset::IDescriptor::E_TYPE::ET_UNIFORM_BUFFER, + .createFlags = video::IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = nbl::hlsl::ShaderStage::ESS_COMPUTE, + .count = 1, + }; + out.dsLayout = m_device->createDescriptorSetLayout({&b, 1}); + + auto pool = m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, {&out.dsLayout.get(), 1}); + out.ds = pool->createDescriptorSet(core::smart_refctd_ptr(out.dsLayout)); + + video::IGPUDescriptorSet::SDescriptorInfo info = {}; + info.desc = core::smart_refctd_ptr(out.inputBuf); + info.info.buffer = {.offset = 0, .size = cfg.inputBytes}; + video::IGPUDescriptorSet::SWriteDescriptorSet w = { + .dstSet = out.ds.get(), + .binding = 0, + .arrayElement = 0, + .count = 1, + .info = &info, + }; + m_device->updateDescriptorSets({&w, 1}, {}); + } + } + + { + const asset::SPushConstantRange pc = { + .stageFlags = nbl::hlsl::ShaderStage::ESS_COMPUTE, + .offset = 0, + .size = uint32_t(cfg.pushConstantBytes), + }; + std::span pcRange = cfg.pushConstantBytes > 0 ? std::span(&pc, 1) : std::span {}; + + if (out.dsLayout) + out.pipelineLayout = m_device->createPipelineLayout(pcRange, core::smart_refctd_ptr(out.dsLayout)); + else + out.pipelineLayout = m_device->createPipelineLayout(pcRange); + } + + return out; + } + + struct BdaBuffer + { + nbl::core::smart_refctd_ptr buf; + uint64_t address = 0; + }; + + BdaBuffer createBdaOutputBuffer(size_t bytes) + { + BdaBuffer out; + out.buf = createOutputBuffer(bytes, nbl::video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT, nbl::video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + out.address = out.buf ? out.buf->getDeviceAddress() : 0; + return out; + } + + // Auto-sizes the dispatch count so the measured window covers ~targetBudgetMs + // of GPU work. Pilots with a small N, then either scales to the budget or + // doubles when the pilot is too noisy (sub-millisecond) to extrapolate. 
+ // + // `samples` controls jitter robustness: values >1 take K independent + // budget-sized timing windows and return the MEDIAN window, costing ~K * + // targetBudgetMs of wall time. Median (not min) is used because GPU + // measurement noise can be two-sided in practice. + TimingResult runTimedBudgeted(uint32_t warmupDispatches, uint64_t targetBudgetMs, const DispatchFn& bindOnce, const DispatchFn& dispatchOne, uint32_t samples) + { + const uint64_t targetBudgetNs = targetBudgetMs * 1'000'000ull; + constexpr uint32_t kPilotN = 64; + constexpr uint32_t kMaxN = 1u << 24; // safety cap for ultra-fast shaders + uint32_t dispatchesPerSubmit = 1u; + TimingResult r = runTimed(warmupDispatches, kPilotN, bindOnce, dispatchOne, dispatchesPerSubmit); + dispatchesPerSubmit = estimateDispatchesPerSubmit(r, kPilotN); + uint32_t lastN = kPilotN; + while (r.elapsed_ns > targetBudgetNs && lastN > 1u) + { + const double scale = double(targetBudgetNs) / r.elapsed_ns; + uint32_t nextN = uint32_t(std::max(1.0, std::floor(double(lastN) * scale))); + if (nextN >= lastN) + nextN = lastN - 1u; + + r = runTimed(warmupDispatches, nextN, bindOnce, dispatchOne, dispatchesPerSubmit); + dispatchesPerSubmit = estimateDispatchesPerSubmit(r, nextN); + lastN = nextN; + } + + while (r.elapsed_ns < targetBudgetNs && lastN < kMaxN) + { + uint32_t nextN; + if (r.elapsed_ns > 1'000'000ull) // > 1 ms, stable enough to scale + { + const double scale = double(targetBudgetNs) / double(r.elapsed_ns); + nextN = uint32_t(std::min(double(kMaxN), std::ceil(double(lastN) * scale))); + } + else + { + nextN = std::min(kMaxN, lastN * 2); + } + if (nextN <= lastN) + break; // converged + r = runTimed(warmupDispatches, nextN, bindOnce, dispatchOne, dispatchesPerSubmit); + dispatchesPerSubmit = estimateDispatchesPerSubmit(r, nextN); + lastN = nextN; + } + + if (samples <= 1) + return r; + + // Reuse the convergence's final measurement as one of the K samples + // (it's already a budget-sized window at lastN). 
Run K-1 more at the + // same N. All windows measure the same dispatch count, so the per-window + // elapsed_ns values are directly comparable. + std::vector ns; + ns.reserve(samples); + ns.push_back(r.elapsed_ns); + for (uint32_t i = 1; i < samples; ++i) + { + const TimingResult ri = runTimed(warmupDispatches, lastN, bindOnce, dispatchOne, dispatchesPerSubmit); + ns.push_back(ri.elapsed_ns); + } + std::sort(ns.begin(), ns.end()); + + // Outlier rejection: GPU jitter is usually a one-sided spike + const double median = ns[ns.size() / 2]; + const double dLow = median - ns.front(); + const double dHigh = ns.back() - median; + const double dCloser = std::min(dLow, dHigh); + const double dFar = std::max(dLow, dHigh); + size_t lo = 0; + size_t hi = ns.size(); + if (dCloser > 0.0 && dFar > 2.0 * dCloser) + { + if (dHigh > dLow) + --hi; // top sample is the spike + else + ++lo; // bottom sample is the spike (rare on GPU but cheap to handle) + } + + double sum = 0.0; + for (size_t i = lo; i < hi; ++i) + sum += ns[i]; + const double resultNs = sum / double(hi - lo); + + TimingResult m {}; + m.elapsed_ns = resultNs; + m.totalSamples = uint64_t(lastN) * m_samplesPerDispatch; + m.ps_per_sample = m.totalSamples ? resultNs * 1e3 / double(m.totalSamples) : 0.0; + m.gsamples_per_s = resultNs > 0.0 ? 
double(m.totalSamples) / resultNs : 0.0; + m.ms_total = resultNs * 1e-6; + return m; + } + + TimingResult runTimed(uint32_t warmupDispatches, uint32_t benchDispatches, const DispatchFn& bindOnce, const DispatchFn& dispatchOne, uint32_t maxDispatchesPerSubmit) + { + if (m_device->waitIdle() != nbl::video::IQueue::RESULT::SUCCESS) + return {}; + + const uint32_t cooldownDispatches = warmupDispatches; + + if (!runUntimedDispatches(warmupDispatches, bindOnce, dispatchOne, maxDispatchesPerSubmit)) + return {}; + + double elapsedNs = 0.0; + uint32_t remaining = benchDispatches; + while (remaining > 0u) + { + const uint32_t batch = std::min(remaining, std::max(1u, maxDispatchesPerSubmit)); + + m_cmdbuf->reset(nbl::video::IGPUCommandBuffer::RESET_FLAGS::NONE); + m_cmdbuf->begin(nbl::video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_cmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); + + if (bindOnce) + bindOnce(m_cmdbuf.get()); + + m_cmdbuf->writeTimestamp(nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); + for (uint32_t i = 0u; i < batch; ++i) + dispatchOne(m_cmdbuf.get()); + m_cmdbuf->writeTimestamp(nbl::asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + m_cmdbuf->end(); + + if (!submitAndWait()) + return {}; + + uint64_t timestamps[2] = {}; + const auto flags = nbl::core::bitflag(nbl::video::IQueryPool::RESULTS_FLAGS::_64_BIT) | nbl::core::bitflag(nbl::video::IQueryPool::RESULTS_FLAGS::WAIT_BIT); + if (!m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags)) + return {}; + + const double timestampPeriod = double(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); + elapsedNs += double(timestamps[1] - timestamps[0]) * timestampPeriod; + remaining -= batch; + } + + if (!runUntimedDispatches(cooldownDispatches, bindOnce, dispatchOne, maxDispatchesPerSubmit)) + return {}; + + TimingResult r {}; + r.elapsed_ns = elapsedNs; + r.totalSamples = uint64_t(benchDispatches) * 
m_samplesPerDispatch; + r.ps_per_sample = r.totalSamples ? r.elapsed_ns * 1e3 / double(r.totalSamples) : 0.0; + r.gsamples_per_s = r.elapsed_ns > 0.0 ? double(r.totalSamples) / r.elapsed_ns : 0.0; + r.ms_total = r.elapsed_ns * 1e-6; + return r; + } + +protected: + static constexpr uint32_t InvalidPipelineIndex = std::numeric_limits::max(); + + const PipelineEntry* getPipelineEntry(uint32_t idx, std::string_view context) const + { + if (idx == InvalidPipelineIndex || idx >= m_pipelines.size() || !m_pipelines[idx].pipeline) + { + benchLogFmt(m_logger.get(), nbl::system::ILogger::ELL_ERROR, "{}: pipeline is not available", context); + return nullptr; + } + return &m_pipelines[idx]; + } + + std::vector m_pipelines; + +private: + // Soft target for one queue submit, estimated from timings on the current GPU. + // Benchmark budgets still control measured work. This only chunks submits. + static constexpr double SubmitChunkTargetNs = 250'000'000.0; + + static uint32_t estimateDispatchesPerSubmit(const TimingResult& r, uint32_t dispatches) + { + if (dispatches == 0u || r.elapsed_ns <= 0.0) + return 1u; + + const double nsPerDispatch = r.elapsed_ns / double(dispatches); + if (nsPerDispatch <= 0.0) + return 1u; + + const double maxDispatches = std::floor(SubmitChunkTargetNs / nsPerDispatch); + return uint32_t(std::clamp(maxDispatches, 1.0, double(std::numeric_limits::max()))); + } + + bool submitAndWait() + { + auto semaphore = m_device->createSemaphore(0u); + if (!semaphore) + return false; + + const nbl::video::IQueue::SSubmitInfo::SCommandBufferInfo cmds[] = {{.cmdbuf = m_cmdbuf.get()}}; + const nbl::video::IQueue::SSubmitInfo::SSemaphoreInfo done[] = { + {.semaphore = semaphore.get(), .value = 1u, .stageMask = nbl::asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS}}; + nbl::video::IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = cmds; + submit.signalSemaphores = done; + if (m_queue->submit({&submit, 1u}) != nbl::video::IQueue::RESULT::SUCCESS) + return false; + + 
const nbl::video::ISemaphore::SWaitInfo wait[] = {{.semaphore = semaphore.get(), .value = 1u}}; + return m_device->blockForSemaphores(wait) == nbl::video::ISemaphore::WAIT_RESULT::SUCCESS; + } + + bool runUntimedDispatches(uint32_t dispatches, const DispatchFn& bindOnce, const DispatchFn& dispatchOne, uint32_t maxDispatchesPerSubmit) + { + while (dispatches > 0u) + { + const uint32_t batch = std::min(dispatches, std::max(1u, maxDispatchesPerSubmit)); + + m_cmdbuf->reset(nbl::video::IGPUCommandBuffer::RESET_FLAGS::NONE); + m_cmdbuf->begin(nbl::video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + if (bindOnce) + bindOnce(m_cmdbuf.get()); + for (uint32_t i = 0u; i < batch; ++i) + dispatchOne(m_cmdbuf.get()); + m_cmdbuf->end(); + + if (!submitAndWait()) + return false; + dispatches -= batch; + } + return true; + } + + static void matchStat(const nbl::video::IGPUPipelineBase::SExecutableStatistic& stat, PipelineStats& out, uint64_t& vgpr, uint64_t& sgpr) + { + const uint64_t v = stat.asUint(); + + auto contains = [&](std::string_view kw) + { + const auto it = std::ranges::search(stat.name, kw, + [&](char a, char b) + { return std::tolower(a) == std::tolower(b); }) + .begin(); + return it != stat.name.end(); + }; + + // Order matters: more specific keys first. 
+ + if (contains("subgroup size") || contains("subgroupsize") || contains("warp size") || contains("wave size")) + out.subgroupSize = std::max(out.subgroupSize, uint32_t(v)); + + else if (contains("vgpr")) + vgpr = std::max(vgpr, v); + else if (contains("sgpr")) + sgpr = std::max(sgpr, v); + else if (contains("register")) + out.registerCount = std::max(out.registerCount, v); + + else if (contains("binary size") || contains("binarysize") || contains("codesize") || contains("code size") || contains("isa size")) + out.codeSizeBytes = std::max(out.codeSizeBytes, v); + else if (contains("instructioncount") || contains("instruction count") || contains("numinstructions")) + out.codeSizeBytes = std::max(out.codeSizeBytes, v); // proxy when no byte size + + else if (contains("shared memory") || contains("sharedmemory") || contains("groupshared") || contains("lds")) + out.sharedMemBytes = std::max(out.sharedMemBytes, v); + + else if (contains("stack size") || contains("stacksize")) + out.stackBytes = std::max(out.stackBytes, v); + + else if (contains("local memory") || contains("localmemory") || contains("scratch") || contains("private memory") || contains("privatememory") || contains("stack")) + out.privateMemBytes = std::max(out.privateMemBytes, v); + + // Vendor-specific stats + // get a structured copy so JSON round-trips the right numeric type. 
+ else + out.unknowns.push_back(stat); + } + + nbl::core::smart_refctd_ptr m_device; + nbl::core::smart_refctd_ptr m_logger; + nbl::video::IPhysicalDevice* m_physicalDevice = nullptr; + nbl::video::IQueue* m_queue = nullptr; + nbl::hlsl::uint32_t3 m_dispatchGroupCount {}; + uint64_t m_samplesPerDispatch = 0; + nbl::core::smart_refctd_ptr m_cmdpool; + nbl::core::smart_refctd_ptr m_cmdbuf; + nbl::core::smart_refctd_ptr m_queryPool; + nbl::core::smart_refctd_ptr m_utils; // lazy, only built on first createBdaBuffer call +}; + +#endif diff --git a/common/include/nbl/examples/Benchmark/IBenchmark.h b/common/include/nbl/examples/Benchmark/IBenchmark.h new file mode 100644 index 000000000..93493c2c6 --- /dev/null +++ b/common/include/nbl/examples/Benchmark/IBenchmark.h @@ -0,0 +1,409 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_COMMON_I_BENCHMARK_INCLUDED_ +#define _NBL_COMMON_I_BENCHMARK_INCLUDED_ + +#include +#include "nbl/examples/Benchmark/BenchmarkTypes.h" +#include "nbl/examples/Benchmark/BenchmarkConsole.h" +#include "nbl/examples/Benchmark/GPUBenchmarkHelper.h" +#include "nbl/examples/Benchmark/BenchmarkJson.h" +#include "nbl/examples/Benchmark/BenchmarkCli.h" +#include "nlohmann/json.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + + +struct RunContext +{ + WorkloadShape shape; + uint64_t targetBudgetMs = 400; // wall-clock budget per row + std::string sectionLabel = "Benchmarks"; +}; + +// Typical use: +// +// Aggregator agg(logger, logicalDevice, physicalDevice, computeFamilyIndex); +// agg.applyCli({.argv = argv, .defaultOutputPath = "Bench.json"}); +// const RunContext myCtx{.shape = ..., .targetBudgetMs = 400, .sectionLabel = "..."}; +// std::vector benches; +// for (...) 
benches.emplace_back(agg, MyBench::SetupData{...}); +// MyOtherBench other(agg, MyOtherBench::SetupData{...}); +// agg.runSessionAndReport( +// Aggregator::Span{std::span(benches), myCtx}, +// Aggregator::Span{std::span(&other, 1), otherCtx}); +class Aggregator +{ + friend class IBenchmark; + +public: + Aggregator() = default; + + Aggregator(nbl::core::smart_refctd_ptr logger, + nbl::core::smart_refctd_ptr logicalDevice, + nbl::video::IPhysicalDevice* physicalDevice, + uint32_t computeFamilyIndex) + { + m_console.setLogger(std::move(logger)); + m_logicalDevice = std::move(logicalDevice); + m_physicalDevicePtr = physicalDevice; + m_computeFamilyIndex = computeFamilyIndex; + setDevice(physicalDevice); + } + + void setSilent(bool silent) { m_console.setSilent(silent); } + + const nbl::core::smart_refctd_ptr& getLogicalDevice() const { return m_logicalDevice; } + nbl::video::IPhysicalDevice* getPhysicalDevice() const { return m_physicalDevicePtr; } + uint32_t getComputeFamilyIndex() const { return m_computeFamilyIndex; } + nbl::core::smart_refctd_ptr getLogger() const + { + return nbl::core::smart_refctd_ptr(m_console.getLogger()); + } + + bool loadBaseline(std::string label, const std::string& path) + { + auto b = benchmark_json::loadBaselineFile(label, path); + if (!b) + return false; + + for (const auto& [_, row] : b->rowsByName) + m_console.growForBaseline(row); + + // Vector (not map) so delta columns print in load order. 
+ auto it = std::find_if(m_baselines.begin(), m_baselines.end(), + [&](const Baseline& existing) { return existing.label == label; }); + if (it != m_baselines.end()) + *it = std::move(*b); + else + m_baselines.push_back(std::move(*b)); + return true; + } + + bool loadBaseline(const std::string& path) { return loadBaseline("baseline", path); } + + bool writeReport(const std::string& path) + { + size_t preservedCount = 0; + if (!benchmark_json::writeReportFile(path, m_device, m_baselines, m_results, m_console.getLogger(), &preservedCount)) + return false; + + if (preservedCount > 0) + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_INFO, + "Wrote benchmark report to {} ({} new + {} preserved from prior file)", + path, m_results.size(), preservedCount); + else + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_INFO, + "Wrote benchmark report to {} ({} rows)", path, m_results.size()); + return true; + } + + // Captured for the UUID-mismatch warning in applyCli. + void setDevice(const nbl::video::IPhysicalDevice* dev) { m_device = benchmark_json::buildDeviceMetadata(dev); } + + struct CliResult + { + std::string outputPath; + nbl::core::vector> focusVariants; + uint32_t focusSamples = 3; // --focus-samples, see samplesForCurrentRow + + bool isFocused(const nbl::core::vector& name) const + { + return std::ranges::find(focusVariants, name) != focusVariants.end(); + } + }; + + template + struct Span + { + std::span benches; + RunContext context; + }; + + // Two overloads so a single bench doesn't need `std::span(&bench, 1)`. 
+ template + requires requires (Range& r) { std::data(r); std::size(r); } + static auto makeSpan(Range& benches, RunContext context) + { + using T = std::remove_reference_t; + return Span{std::span(std::data(benches), std::size(benches)), std::move(context)}; + } + + template + requires std::derived_from + static Span makeSpan(T& bench, RunContext context) + { + return Span{std::span(&bench, 1), std::move(context)}; + } + + static std::string describe(const RunContext& ctx) + { + const auto& sh = ctx.shape; + const uint32_t wgThreads = sh.workgroupSize.x * sh.workgroupSize.y * sh.workgroupSize.z; + const uint32_t threadsPerDisp = sh.dispatchGroupCount.x * sh.dispatchGroupCount.y * sh.dispatchGroupCount.z * wgThreads; + const uint64_t itersPerThread = threadsPerDisp ? sh.samplesPerDispatch / threadsPerDisp : 0; + const double budgetMs = double(ctx.targetBudgetMs); + return std::format("=== {} (~{:.0f}ms/row, {} threads/dispatch, {} iters/thread; wg={}x{}x{}; ps/sample is per all GPU threads) ===", + ctx.sectionLabel, budgetMs, threadsPerDisp, itersPerThread, sh.workgroupSize.x, sh.workgroupSize.y, sh.workgroupSize.z); + } + + // Order: banner -> focus(spans...) -> comparison table -> banner -> + // column header -> rest(spans...) -> writeReport. + // All focus rows print globally first, then all rest rows; banner printed + // twice so each chunk reads in isolation when scrolling back. + template + requires(std::derived_from && ...) + void runSessionAndReport(Span... spans) + { + // Templated lambda (not `auto& s`) so only Span deduces -- a future + // signature change can't silently start passing arbitrary types through. + auto runSpan = [this](Span& s, bool silent) + { + if (s.benches.empty()) + return; + if (!silent) + { + m_console.logSectionBanner(describe(s.context)); + m_console.logHeader(m_baselines); + } + for (auto& e : s.benches) + e.run(); + // Flush after each rest span: if span N+1 dies mid-way, span N's + // rows are already on disk. 
Trailing flush is also the final write. + if (!silent) + writeReport(m_cli.outputPath); + }; + + m_console.logBannerNotes(m_baselines); + if (!m_cli.focusVariants.empty()) + { + m_console.setSilent(true); // benches read this to know they're in the focused-rows half + (runSpan(spans, true), ...); + m_console.setSilent(false); + m_console.printBaselineComparison(std::span>(m_focusNames), m_baselines, m_results); + } + (runSpan(spans, false), ...); + } + + struct CliConfig + { + std::span argv; // feed from IApplicationFramework::argv + std::string defaultOutputPath = "Bench.json"; + std::string appName = "benchmark"; + }; + + CliResult applyCli(const CliConfig& cfg) + { + auto parsed = benchmark_cli::parseArgs(cfg.argv, cfg.defaultOutputPath); + if (parsed.helpRequested) + { + benchmark_cli::printHelp(m_console.getLogger(), cfg.appName, cfg.defaultOutputPath); + exit(0); + } + if (parsed.noColor) + m_console.setColorEnabled(false); + + CliResult res; + res.outputPath = parsed.outputPath; + + if (!parsed.baselines.empty()) + { + size_t succeeded = 0; + for (const auto& [label, path] : parsed.baselines) + { + if (loadBaseline(label, path)) + { + ++succeeded; + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_INFO, + "Loaded baseline '{}' from {} ({} rows)", label, path, m_baselines.back().rowsByName.size()); + } + else + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_WARNING, + "Failed to load baseline '{}' from {}, skipped", label, path); + } + if (succeeded == 0) + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_WARNING, + "All {} --baseline load(s) failed. delta columns and --focus will be empty. 
" + "Check the paths above; default auto-load of '{}' is suppressed once any --baseline is specified, " + "drop the --baseline flag(s) or use --no-baseline to silence this warning.", + parsed.baselines.size(), res.outputPath); + else if (succeeded < parsed.baselines.size()) + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_WARNING, + "{} of {} --baseline load(s) failed; continuing with {} loaded.", + parsed.baselines.size() - succeeded, parsed.baselines.size(), succeeded); + } + else if (!parsed.noBaseline) + { + if (loadBaseline(res.outputPath)) + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_INFO, + "Loaded baseline from {} ({} rows)", res.outputPath, + m_baselines.empty() ? size_t {0} : m_baselines.back().rowsByName.size()); + else + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_INFO, + "No baseline at {}, delta column will read 'n/a'", res.outputPath); + } + + warnDeviceMismatch(); + + res.focusVariants = std::move(parsed.focus); + res.focusSamples = parsed.focusSamples; + m_cli = res; + return res; + } + +private: + void warnDeviceMismatch() const + { + if (!m_device.is_object() || !m_device.contains("deviceUUID")) + return; + const auto& currentUUID = m_device["deviceUUID"]; + for (const auto& b : m_baselines) + { + if (!b.device.is_object() || !b.device.contains("deviceUUID")) + continue; + if (b.device["deviceUUID"] == currentUUID) + continue; + const std::string baselineDevName = b.device.value("name", std::string {""}); + const std::string currentDevName = m_device.value("name", std::string {""}); + benchLogFmt(m_console.getLogger(), nbl::system::ILogger::ELL_WARNING, + "Baseline '{}' (from {}) was measured on a different GPU ('{}' vs current '{}'). 
" + "Delta values will be apples-to-oranges.", + b.label, b.path, baselineDevName, currentDevName); + } + } + + // In focus phase (silent), captures the row's name into m_focusNames so + // runSessionAndReport can build the comparison table without main.cpp + // threading names back through each bench class. + void appendAndLog(Result&& r) + { + const std::string joined = joinName(r.name); + if (!m_baselines.empty()) + { + const std::string key = makeKey(r.name); + for (const auto& b : m_baselines) + { + auto it = b.rowsByName.find(key); + if (it == b.rowsByName.end()) + continue; + const bool shapeMismatch = r.workload.present() && it->second.workload.present() && (r.workload.shape != it->second.workload.shape); + r.baselines[b.label] = {it->second.psPerSample, shapeMismatch}; + } + } + m_console.growWidthFor(joined); + if (m_console.silent()) + m_focusNames.push_back(r.name); + m_results.push_back(std::move(r)); + m_console.logRow(std::span(m_results.back().name), joined, m_results.back().timing, m_results.back().stats, m_results.back().baselines, m_baselines); + } + + std::vector m_results; + std::vector m_baselines; + nbl::core::vector> m_focusNames; + nlohmann::json m_device; + CliResult m_cli; + BenchmarkConsole m_console; + nbl::core::smart_refctd_ptr m_logicalDevice; + nbl::video::IPhysicalDevice* m_physicalDevicePtr = nullptr; + uint32_t m_computeFamilyIndex = 0; +}; + +class IBenchmark +{ +public: + virtual ~IBenchmark() = default; + + // Single-named benches override doRun() and inherit this default filter. + // Sweep-style benches synthesize per-row names; they override run() and + // do per-row filtering themselves. + virtual void run() + { + const bool silent = isFocusPhase(); + const bool inFocus = isFocused(m_name); + const bool shouldRun = silent ? 
inFocus : !inFocus; + if (shouldRun) + doRun(); + } + + uint32_t getWarmupDispatches() const { return m_warmupDispatches; } + uint64_t getTargetBudgetMs() const { return m_targetBudgetMs; } + const WorkloadShape& getShape() const { return m_workloadShape; } + + // Pass this to runTimedBudgeted so only --focus rows pay the K * budget cost. + uint32_t samplesForCurrentRow() const { return isFocusPhase() ? m_aggregator.m_cli.focusSamples : 1u; } + +protected: + // Banner label is NOT taken here; it belongs to the span (see Aggregator::Span). + IBenchmark(Aggregator& aggregator, core::vector name, uint32_t warmupDispatches, const WorkloadShape& shape, uint64_t targetBudgetMs) + : m_name(std::move(name)) + , m_aggregator(aggregator) + , m_warmupDispatches(warmupDispatches) + , m_targetBudgetMs(targetBudgetMs) + , m_workloadShape(shape) + { + registerVariant(m_name); + } + + virtual void doRun() {} + + bool isFocusPhase() const { return m_aggregator.m_console.silent(); } + bool isFocused(const core::vector& name) const { return m_aggregator.m_cli.isFocused(name); } + void registerVariant(std::span name) { m_aggregator.m_console.registerVariant(name); } + void registerVariant(std::initializer_list name) { m_aggregator.m_console.registerVariant(name); } + + void record(core::vector name, const TimingResult& t, const PipelineStats& s) + { + Workload w{.shape = m_workloadShape}; + w.benchDispatches = w.shape.samplesPerDispatch ? 
uint32_t(t.totalSamples / w.shape.samplesPerDispatch) : 0; + + Result r; + r.name = std::move(name); + r.timing = t; + r.stats = s; + r.workload = w; + m_aggregator.appendAndLog(std::move(r)); + } + + core::vector m_name; + Aggregator& m_aggregator; // non-owning, outlives this bench + uint32_t m_warmupDispatches; + uint64_t m_targetBudgetMs; + WorkloadShape m_workloadShape; +}; + +class GPUBenchmark : public IBenchmark, public GPUBenchmarkHelper +{ +public: + struct SetupData + { + core::vector name; + uint32_t warmupDispatches = 0; + WorkloadShape shape = {}; + uint64_t targetBudgetMs = 400; + }; + +protected: + GPUBenchmark(Aggregator& aggregator, const SetupData& data) + : IBenchmark(aggregator, data.name, data.warmupDispatches, data.shape, data.targetBudgetMs) + { + GPUBenchmarkHelper::init({ + .device = aggregator.getLogicalDevice(), + .logger = aggregator.getLogger(), + .physicalDevice = aggregator.getPhysicalDevice(), + .computeFamilyIndex = aggregator.getComputeFamilyIndex(), + .dispatchGroupCount = data.shape.dispatchGroupCount, + .samplesPerDispatch = data.shape.samplesPerDispatch, + }); + } +}; + +#endif diff --git a/common/include/nbl/examples/Tester/FailureManifest.h b/common/include/nbl/examples/Tester/FailureManifest.h new file mode 100644 index 000000000..a703e933e --- /dev/null +++ b/common/include/nbl/examples/Tester/FailureManifest.h @@ -0,0 +1,331 @@ +#ifndef _NBL_COMMON_TESTER_FAILURE_MANIFEST_INCLUDED_ +#define _NBL_COMMON_TESTER_FAILURE_MANIFEST_INCLUDED_ + +#include + +#include "nlohmann/json.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nbl::examples::testing +{ + +struct FailureCase +{ + std::string check; + std::string side; + uint64_t iteration = 0; + uint32_t seed = 0; + double maxRelative = 0.0; + double maxAbsolute = 0.0; +}; + +struct FailureGroup +{ + std::string phase; + std::string id; + std::string name; + std::string logFile; + 
std::vector cases; + uint32_t omittedCases = 0; +}; + +class FailureManifest +{ + public: + explicit FailureManifest(std::string suite = {}) : m_suite(std::move(suite)) {} + + void setSuite(std::string suite) { m_suite = std::move(suite); } + + void addGroupFailure(std::string_view phase, std::string_view id, std::string_view name, std::string_view logFile = {}) + { + auto& group = groupFor(phase, id, name); + if (!logFile.empty()) + group.logFile = std::string(logFile); + } + + void addCase(std::string_view phase, std::string_view id, std::string_view name, std::string_view check, std::string_view side, + uint64_t iteration, uint32_t seed, double maxRelative, double maxAbsolute) + { + auto& group = groupFor(phase, id, name); + if (group.cases.size() >= MaxCasesPerGroup) + { + ++group.omittedCases; + return; + } + + group.cases.push_back(FailureCase{ + .check = std::string(check), + .side = std::string(side), + .iteration = iteration, + .seed = seed, + .maxRelative = maxRelative, + .maxAbsolute = maxAbsolute, + }); + } + + const std::vector& failures() const { return m_failures; } + + nlohmann::json toJson() const + { + nlohmann::json doc; + doc["version"] = 1; + doc["suite"] = m_suite; + auto& failures = doc["failures"] = nlohmann::json::array(); + + for (const auto& group : m_failures) + { + nlohmann::json g; + g["phase"] = group.phase; + g["id"] = group.id; + g["name"] = group.name; + if (!group.logFile.empty()) + g["log_file"] = group.logFile; + + auto& cases = g["cases"] = nlohmann::json::array(); + for (const auto& c : group.cases) + { + nlohmann::json row; + row["check"] = c.check; + row["side"] = c.side; + row["iteration"] = c.iteration; + row["seed"] = c.seed; + row["max_relative"] = c.maxRelative; + row["max_absolute"] = c.maxAbsolute; + cases.push_back(std::move(row)); + } + + if (group.omittedCases > 0) + g["omitted_cases"] = group.omittedCases; + + failures.push_back(std::move(g)); + } + + return doc; + } + + private: + static constexpr size_t 
MaxCasesPerGroup = 64; + + FailureGroup& groupFor(std::string_view phase, std::string_view id, std::string_view name) + { + const std::string idString(id); + auto it = std::find_if(m_failures.begin(), m_failures.end(), [&](const FailureGroup& g) { return g.id == idString; }); + if (it != m_failures.end()) + { + if (it->name.empty()) + it->name = std::string(name); + if (it->phase.empty()) + it->phase = std::string(phase); + return *it; + } + + m_failures.push_back(FailureGroup{ + .phase = std::string(phase), + .id = idString, + .name = std::string(name), + }); + return m_failures.back(); + } + + std::string m_suite; + std::vector m_failures; +}; + +class TestFilter +{ + public: + bool enabled() const { return m_enabled; } + + void enable() { m_enabled = true; } + + bool shouldRun(std::string_view id) const + { + return !m_enabled || m_ids.contains(std::string(id)); + } + + void add(std::string_view id) + { + m_enabled = true; + const auto first = id.find_first_not_of(" \t\r\n"); + if (first == std::string_view::npos) + return; + const auto last = id.find_last_not_of(" \t\r\n"); + m_ids.insert(std::string(id.substr(first, last - first + 1))); + } + + void addSeed(std::string_view id, uint32_t seed) + { + add(id); + m_seeds[std::string(id)] = seed; + } + + void addList(std::string_view ids) + { + m_enabled = true; + while (!ids.empty()) + { + const auto comma = ids.find(','); + add(ids.substr(0, comma)); + if (comma == std::string_view::npos) + return; + ids.remove_prefix(comma + 1); + } + } + + std::optional seedFor(std::string_view id) const + { + auto it = m_seeds.find(std::string(id)); + if (it == m_seeds.end()) + return {}; + return it->second; + } + + private: + bool m_enabled = false; + std::set m_ids; + std::map m_seeds; +}; + +struct RunControl +{ + bool valid = true; + bool skipBenchmarks = false; + std::string failedOutPath; + TestFilter filter; +}; + +inline bool addFailedIdsFromFile(TestFilter& filter, const std::string& path, nbl::system::ILogger* 
logger) +{ + filter.enable(); + std::ifstream in(path); + if (!in.is_open()) + { + if (logger) + logger->log("Failed to open failed-test manifest '%s'", nbl::system::ILogger::ELL_ERROR, path.c_str()); + return false; + } + + nlohmann::json doc; + try + { + in >> doc; + } + catch (const std::exception& e) + { + if (logger) + logger->log("Failed to parse failed-test manifest '%s': %s", nbl::system::ILogger::ELL_ERROR, path.c_str(), e.what()); + return false; + } + + const auto failuresIt = doc.find("failures"); + if (failuresIt == doc.end() || !failuresIt->is_array()) + { + if (logger) + logger->log("Failed-test manifest '%s' does not contain a failures array", nbl::system::ILogger::ELL_ERROR, path.c_str()); + return false; + } + + for (const auto& failure : *failuresIt) + { + if (!failure.is_object()) + continue; + const auto idIt = failure.find("id"); + if (idIt != failure.end() && idIt->is_string()) + { + const std::string id = idIt->get(); + const auto casesIt = failure.find("cases"); + if (casesIt != failure.end() && casesIt->is_array()) + { + const auto seedIt = std::find_if(casesIt->begin(), casesIt->end(), [](const nlohmann::json& row) { + if (!row.is_object()) + return false; + const auto it = row.find("seed"); + return it != row.end() && it->is_number_integer(); + }); + if (seedIt != casesIt->end()) + { + filter.addSeed(id, (*seedIt)["seed"].get()); + continue; + } + } + filter.add(id); + } + } + + return true; +} + +inline RunControl parseRunControl(std::span argv, nbl::system::ILogger* logger) +{ + RunControl out; + + for (size_t i = 1; i < argv.size(); ++i) + { + const std::string& arg = argv[i]; + if (arg == "--skip-benchmarks") + out.skipBenchmarks = true; + else if (arg == "--failed-out" && i + 1 < argv.size()) + out.failedOutPath = argv[++i]; + else if (arg.starts_with("--failed-out=")) + out.failedOutPath = arg.substr(std::string("--failed-out=").size()); + else if (arg == "--test" && i + 1 < argv.size()) + out.filter.addList(argv[++i]); + else if 
(arg.starts_with("--test=")) + out.filter.addList(std::string_view(arg).substr(std::string_view("--test=").size())); + else if (arg == "--rerun-failed" && i + 1 < argv.size()) + { + if (!addFailedIdsFromFile(out.filter, argv[++i], logger)) + out.valid = false; + } + else if (arg.starts_with("--rerun-failed=")) + { + if (!addFailedIdsFromFile(out.filter, arg.substr(std::string("--rerun-failed=").size()), logger)) + out.valid = false; + } + } + + if (out.filter.enabled()) + out.skipBenchmarks = true; + + return out; +} + +inline bool writeFailureManifestFile(const FailureManifest& manifest, const std::string& path, nbl::system::ILogger* logger) +{ + std::ofstream out(path, std::ios::out | std::ios::trunc); + if (!out.is_open()) + { + if (logger) + logger->log("Failed to open failed-test manifest '%s' for writing", nbl::system::ILogger::ELL_ERROR, path.c_str()); + return false; + } + + out << manifest.toJson().dump(3) << '\n'; + if (!out.good()) + { + if (logger) + logger->log("Failed to write failed-test manifest '%s'", nbl::system::ILogger::ELL_ERROR, path.c_str()); + return false; + } + + if (logger) + logger->log("Wrote failed-test manifest '%s' with %llu failed groups", nbl::system::ILogger::ELL_INFO, + path.c_str(), static_cast(manifest.failures().size())); + return true; +} + +} // namespace nbl::examples::testing + +#endif diff --git a/common/include/nbl/examples/Tester/ITester.h b/common/include/nbl/examples/Tester/ITester.h index 8fd4c6639..27dfccff2 100644 --- a/common/include/nbl/examples/Tester/ITester.h +++ b/common/include/nbl/examples/Tester/ITester.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -171,6 +172,7 @@ class ITester bool performTestsAndVerifyResults(const std::string& logFileName) { + m_failureLogFile = logFileName; m_logFile.open(logFileName, std::ios::out | std::ios::trunc); if (!m_logFile.is_open()) m_logger->log("Failed to open log file!", system::ILogger::ELL_ERROR); @@ -197,6 +199,8 @@ class ITester 
core::vector gpuTestResults = performGpuTests(inputTestValues); bool pass = verifyAllTestResults(cpuTestResults, gpuTestResults, exceptedTestResults); + if (!pass && m_failureManifest) + m_failureManifest->addGroupFailure(m_failurePhase, m_failureId, m_failureName, m_failureLogFile); m_logger->log("TESTS DONE.", system::ILogger::ELL_PERFORMANCE); reloadSeed(); @@ -205,6 +209,20 @@ class ITester return pass; } + void setFailureRecordContext(nbl::examples::testing::FailureManifest* manifest, std::string phase, std::string id, std::string name) + { + m_failureManifest = manifest; + m_failurePhase = std::move(phase); + m_failureId = std::move(id); + m_failureName = std::move(name); + } + + void setSeed(uint32_t seed) + { + m_seed = seed; + m_mersenneTwister = std::mt19937(m_seed); + } + virtual ~ITester() { m_outputBufferAllocation.memory->unmap(); @@ -339,6 +357,13 @@ class ITester ss << " DIFFERENCE: " << system::to_string(hlsl::abs(expectedVal - testVal)); ss << " MAX RELATIVE: " << system::to_string(maxRelativeDifference) << " MAX ABSOLUTE " << system::to_string(maxAbsoluteDifference) << '\n'; + if (m_failureManifest) + { + const char* side = testType == TestType::CPU ? 
"CPU" : "GPU"; + m_failureManifest->addCase(m_failurePhase, m_failureId, m_failureName, memberName, side, + testIteration, seed, maxRelativeDifference, maxAbsoluteDifference); + } + m_logger->log("%s", system::ILogger::ELL_ERROR, ss.str().c_str()); m_logFile << ss.str() << '\n'; } @@ -439,6 +464,11 @@ class ITester uint32_t m_seed; std::ofstream m_logFile; core::unordered_map m_maxErrors; + nbl::examples::testing::FailureManifest* m_failureManifest = nullptr; + std::string m_failurePhase; + std::string m_failureId; + std::string m_failureName; + std::string m_failureLogFile; }; -#endif \ No newline at end of file +#endif diff --git a/common/include/nbl/examples/cameras/CCamera.hpp b/common/include/nbl/examples/cameras/CCamera.hpp index f185e60f6..8fadbd866 100644 --- a/common/include/nbl/examples/cameras/CCamera.hpp +++ b/common/include/nbl/examples/cameras/CCamera.hpp @@ -16,8 +16,8 @@ #include #include -class Camera -{ +class Camera +{ public: Camera() = default; Camera(const nbl::core::vectorSIMDf& position, const nbl::core::vectorSIMDf& lookat, const nbl::hlsl::float32_t4x4& projection, float moveSpeed = 1.0f, float rotateSpeed = 1.0f, const nbl::core::vectorSIMDf& upVec = nbl::core::vectorSIMDf(0.0f, 1.0f, 0.0f), const nbl::core::vectorSIMDf& backupUpVec = nbl::core::vectorSIMDf(0.5f, 1.0f, 0.0f)) @@ -43,6 +43,8 @@ class Camera enum E_CAMERA_MOVE_KEYS : uint8_t { ECMK_MOVE_FORWARD = 0, + ECMK_MOVE_UP, + ECMK_MOVE_DOWN, ECMK_MOVE_BACKWARD, ECMK_MOVE_LEFT, ECMK_MOVE_RIGHT, @@ -51,6 +53,8 @@ class Camera inline void mapKeysToWASD() { + keysMap[ECMK_MOVE_UP] = nbl::ui::EKC_E; + keysMap[ECMK_MOVE_DOWN] = nbl::ui::EKC_Q; keysMap[ECMK_MOVE_FORWARD] = nbl::ui::EKC_W; keysMap[ECMK_MOVE_BACKWARD] = nbl::ui::EKC_S; keysMap[ECMK_MOVE_LEFT] = nbl::ui::EKC_A; @@ -68,7 +72,7 @@ class Camera inline void mapKeysCustom(std::array& map) { keysMap = map; } inline const nbl::hlsl::float32_t4x4& getProjectionMatrix() const { return projMatrix; } - inline const 
nbl::hlsl::float32_t3x4& getViewMatrix() const { return viewMatrix; } + inline const nbl::hlsl::float32_t3x4& getViewMatrix() const { return viewMatrix; } inline const nbl::hlsl::float32_t4x4& getConcatenatedMatrix() const { return concatMatrix; } inline void setProjectionMatrix(const nbl::hlsl::float32_t4x4& projection) @@ -77,16 +81,16 @@ class Camera leftHanded = nbl::hlsl::determinant(projMatrix) < 0.f; concatMatrix = nbl::hlsl::math::linalg::promoted_mul(projMatrix, viewMatrix); } - + inline void setPosition(const nbl::core::vectorSIMDf& pos) { position.set(pos); recomputeViewMatrix(); } - + inline const nbl::core::vectorSIMDf& getPosition() const { return position; } - inline void setTarget(const nbl::core::vectorSIMDf& pos) + inline void setTarget(const nbl::core::vectorSIMDf& pos) { target.set(pos); recomputeViewMatrix(); @@ -95,11 +99,11 @@ class Camera inline const nbl::core::vectorSIMDf& getTarget() const { return target; } inline void setUpVector(const nbl::core::vectorSIMDf& up) { upVector = up; } - + inline void setBackupUpVector(const nbl::core::vectorSIMDf& up) { backupUpVector = up; } inline const nbl::core::vectorSIMDf& getUpVector() const { return upVector; } - + inline const nbl::core::vectorSIMDf& getBackupUpVector() const { return backupUpVector; } inline const float getMoveSpeed() const { return moveSpeed; } @@ -110,7 +114,7 @@ class Camera inline void setRotateSpeed(const float _rotateSpeed) { rotateSpeed = _rotateSpeed; } - inline void recomputeViewMatrix() + inline void recomputeViewMatrix() { nbl::hlsl::float32_t3 pos = nbl::core::convertToHLSLVector(position).xyz; nbl::hlsl::float32_t3 localTarget = nbl::hlsl::normalize(nbl::core::convertToHLSLVector(target).xyz - pos); @@ -140,63 +144,78 @@ class Camera void mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events) { - for (auto eventIt=events.begin(); eventIt!=events.end(); eventIt++) + for (auto eventIt = events.begin(); eventIt != events.end(); eventIt++) { auto ev = *eventIt; 
- if(ev.type == nbl::ui::SMouseEvent::EET_CLICK && ev.clickEvent.mouseButton == nbl::ui::EMB_LEFT_BUTTON) - if(ev.clickEvent.action == nbl::ui::SMouseEvent::SClickEvent::EA_PRESSED) + if (ev.type == nbl::ui::SMouseEvent::EET_CLICK && ev.clickEvent.mouseButton == nbl::ui::EMB_LEFT_BUTTON) + if (ev.clickEvent.action == nbl::ui::SMouseEvent::SClickEvent::EA_PRESSED) mouseDown = true; else if (ev.clickEvent.action == nbl::ui::SMouseEvent::SClickEvent::EA_RELEASED) mouseDown = false; - if(ev.type == nbl::ui::SMouseEvent::EET_MOVEMENT && mouseDown) + if (ev.type == nbl::ui::SMouseEvent::EET_MOVEMENT && mouseDown) { - nbl::hlsl::float32_t4 pos = nbl::core::convertToHLSLVector(getPosition()); - nbl::hlsl::float32_t4 localTarget = nbl::core::convertToHLSLVector(getTarget()) - pos; - - // Get Relative Rotation for localTarget in Radians - float relativeRotationX, relativeRotationY; - relativeRotationY = atan2(localTarget.x, localTarget.z); - const double z1 = nbl::core::sqrt(localTarget.x*localTarget.x + localTarget.z*localTarget.z); - relativeRotationX = atan2(z1, localTarget.y) - nbl::core::PI()/2; - - constexpr float RotateSpeedScale = 0.003f; - relativeRotationX -= ev.movementEvent.relativeMovementY * rotateSpeed * RotateSpeedScale * -1.0f; - float tmpYRot = ev.movementEvent.relativeMovementX * rotateSpeed * RotateSpeedScale * -1.0f; - + // --- corrected camera rotation update --- + nbl::hlsl::float32_t3 pos = nbl::core::convertToHLSLVector(getPosition()).xyz; + nbl::hlsl::float32_t3 targetVec = nbl::core::convertToHLSLVector(getTarget()).xyz - pos; // original vector to target + + // preserve distance so we don't collapse to unit length + float targetDistance = nbl::hlsl::length(targetVec); + if (targetDistance < 1e-6f) targetDistance = 1.0f; // avoid div-by-zero + + nbl::hlsl::float32_t3 forward = nbl::hlsl::normalize(targetVec); + nbl::hlsl::float32_t3 upVector = nbl::core::convertToHLSLVector(getUpVector()).xyz; + nbl::hlsl::float32_t3 right = 
nbl::hlsl::normalize(nbl::hlsl::cross(upVector, forward)); + nbl::hlsl::float32_t3 correctedForward = nbl::hlsl::normalize(nbl::hlsl::cross(right, upVector)); + + // horizontal yaw (angle from correctedForward towards right) + float rightDot = nbl::hlsl::dot(targetVec, right); + float forwardDot = nbl::hlsl::dot(targetVec, correctedForward); + float relativeRotationY = atan2(rightDot, forwardDot); + + // pitch: angle above/below horizontal + float upDot = nbl::hlsl::dot(targetVec, upVector); + nbl::hlsl::float32_t3 horizontalComponent = targetVec - upVector * upDot; + float horizontalLength = nbl::hlsl::length(horizontalComponent); + float relativeRotationX = atan2(upDot, horizontalLength); + + // apply mouse/controller deltas (signs simplified) + constexpr float RotateSpeedScale = 0.003f; + relativeRotationX -= ev.movementEvent.relativeMovementY * rotateSpeed * RotateSpeedScale; + float tmpYRot = ev.movementEvent.relativeMovementX * rotateSpeed * RotateSpeedScale; if (leftHanded) - relativeRotationY -= tmpYRot; - else relativeRotationY += tmpYRot; - - const double MaxVerticalAngle = nbl::core::radians(88.0f); - - if (relativeRotationX > MaxVerticalAngle*2 && relativeRotationX < 2 * nbl::core::PI()-MaxVerticalAngle) - relativeRotationX = 2 * nbl::core::PI()-MaxVerticalAngle; else - if (relativeRotationX > MaxVerticalAngle && relativeRotationX < 2 * nbl::core::PI()-MaxVerticalAngle) - relativeRotationX = MaxVerticalAngle; - - pos.w = 0; - localTarget = nbl::hlsl::float32_t4(0, 0, nbl::core::max(1.f, nbl::hlsl::length(pos)), 1.0f); + relativeRotationY -= tmpYRot; - const nbl::hlsl::math::quaternion quat = nbl::hlsl::math::quaternion::create(relativeRotationX, relativeRotationY, 0.0f); - nbl::hlsl::float32_t3x4 mat = nbl::hlsl::math::linalg::promote_affine<3, 4, 3, 3>(quat.__constructMatrix()); + // clamp pitch + const float MaxVerticalAngle = nbl::core::radians(88.0f); + if (relativeRotationX > MaxVerticalAngle) relativeRotationX = MaxVerticalAngle; + if 
(relativeRotationX < -MaxVerticalAngle) relativeRotationX = -MaxVerticalAngle; + // build final direction by first yaw-rotating in the horizontal plane, then pitching + float cosYaw = cos(relativeRotationY); + float sinYaw = sin(relativeRotationY); + nbl::hlsl::float32_t3 yawForward = correctedForward * cosYaw + right * sinYaw; + yawForward = nbl::hlsl::normalize(yawForward); - localTarget = nbl::hlsl::float32_t4(nbl::hlsl::mul(mat, localTarget), 1.0f); + float cosPitch = cos(relativeRotationX); + float sinPitch = sin(relativeRotationX); + nbl::hlsl::float32_t3 finalDir = nbl::hlsl::normalize(yawForward * cosPitch + upVector * sinPitch); - nbl::core::vectorSIMDf finalTarget = nbl::core::constructVecorSIMDFromHLSLVector(localTarget + pos); + // restore original distance and set target + nbl::core::vectorSIMDf finalTarget = nbl::core::constructVecorSIMDFromHLSLVector(pos + finalDir * targetDistance); finalTarget.w = 1.0f; setTarget(finalTarget); + } } } void keyboardProcess(const nbl::ui::IKeyboardEventChannel::range_t& events) { - for(uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k) + for (uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k) perActionDt[k] = 0.0; /* @@ -205,8 +224,8 @@ class Camera * And If an UP event was sent It will get subtracted it from this value. 
(Currently Disabled Because we Need better Oracle) */ - for(uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k) - if(keysDown[k]) + for (uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k) + if (keysDown[k]) { auto timeDiff = std::chrono::duration_cast(nextPresentationTimeStamp - lastVirtualUpTimeStamp).count(); if (timeDiff < 0) @@ -214,28 +233,28 @@ class Camera perActionDt[k] += timeDiff; } - for (auto eventIt=events.begin(); eventIt!=events.end(); eventIt++) + for (auto eventIt = events.begin(); eventIt != events.end(); eventIt++) { const auto ev = *eventIt; - + // accumulate the periods for which a key was down auto timeDiff = std::chrono::duration_cast(nextPresentationTimeStamp - ev.timeStamp).count(); if (timeDiff < 0) timeDiff = 0; // handle camera movement - for (const auto logicalKey : { ECMK_MOVE_FORWARD, ECMK_MOVE_BACKWARD, ECMK_MOVE_LEFT, ECMK_MOVE_RIGHT }) + for (const auto logicalKey : { ECMK_MOVE_FORWARD, ECMK_MOVE_UP, ECMK_MOVE_DOWN, ECMK_MOVE_BACKWARD, ECMK_MOVE_LEFT, ECMK_MOVE_RIGHT }) { const auto code = keysMap[logicalKey]; if (ev.keyCode == code) { - if (ev.action == nbl::ui::SKeyboardEvent::ECA_PRESSED && !keysDown[logicalKey]) + if (ev.action == nbl::ui::SKeyboardEvent::ECA_PRESSED && !keysDown[logicalKey]) { perActionDt[logicalKey] += timeDiff; keysDown[logicalKey] = true; } - else if (ev.action == nbl::ui::SKeyboardEvent::ECA_RELEASED) + else if (ev.action == nbl::ui::SKeyboardEvent::ECA_RELEASED) { // perActionDt[logicalKey] -= timeDiff; keysDown[logicalKey] = false; @@ -259,7 +278,7 @@ class Camera nextPresentationTimeStamp = _nextPresentationTimeStamp; return; } - + void endInputProcessing(std::chrono::microseconds _nextPresentationTimeStamp) { nbl::core::vectorSIMDf pos = getPosition(); @@ -271,13 +290,12 @@ class Camera movedir.makeSafe3D(); movedir = nbl::core::normalize(movedir); - constexpr float MoveSpeedScale = 0.02f; + constexpr float MoveSpeedScale = 0.02f; pos += movedir * 
perActionDt[E_CAMERA_MOVE_KEYS::ECMK_MOVE_FORWARD] * moveSpeed * MoveSpeedScale; pos -= movedir * perActionDt[E_CAMERA_MOVE_KEYS::ECMK_MOVE_BACKWARD] * moveSpeed * MoveSpeedScale; - // strafing - + // if upvector and vector to the target are the same, we have a // problem. so solve this problem: nbl::core::vectorSIMDf up = nbl::core::normalize(upVector); @@ -288,6 +306,11 @@ class Camera up = nbl::core::normalize(backupUpVector); } + nbl::core::vectorSIMDf currentUp = nbl::core::normalize(nbl::core::cross(localTarget, nbl::core::cross(up, localTarget))); + pos += currentUp * perActionDt[E_CAMERA_MOVE_KEYS::ECMK_MOVE_UP] * moveSpeed * MoveSpeedScale; + pos -= currentUp * perActionDt[E_CAMERA_MOVE_KEYS::ECMK_MOVE_DOWN] * moveSpeed * MoveSpeedScale; + + // strafing nbl::core::vectorSIMDf strafevect = localTarget; if (leftHanded) strafevect = nbl::core::cross(strafevect, up); @@ -303,18 +326,23 @@ class Camera firstUpdate = false; setPosition(pos); - setTarget(localTarget+pos); + setTarget(localTarget + pos); lastVirtualUpTimeStamp = nextPresentationTimeStamp; } + // TODO: temporary but a good fix for the camera events when mouse stops dragging gizmo + void mouseKeysUp() + { + mouseDown = false; + } private: inline void initDefaultKeysMap() { mapKeysToWASD(); } - - inline void allKeysUp() + + inline void allKeysUp() { - for (uint32_t i=0; i< E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++i) + for (uint32_t i = 0; i < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++i) keysDown[i] = false; mouseDown = false; @@ -327,7 +355,7 @@ class Camera float moveSpeed, rotateSpeed; bool leftHanded, firstUpdate = true, mouseDown = false; - + std::array keysMap = { {nbl::ui::EKC_NONE} }; // map camera E_CAMERA_MOVE_KEYS to corresponding Nabla key codes, by default camera uses WSAD to move // TODO: make them use std::array bool keysDown[E_CAMERA_MOVE_KEYS::ECMK_COUNT] = {};