From 6f16cf5491102ef1f5d21cd6c201562c0a915372 Mon Sep 17 00:00:00 2001
From: shaoboyan <shaoboyan@microsoft.com>
Date: Wed, 31 Dec 2025 16:32:37 +0800
Subject: [PATCH 1/5] Operation tests for setImmediates

Implement operation tests for setImmediates in ComputePassEncoder,
RenderPassEncoder, and RenderBundleEncoder.

- Add basic execution tests for scalar, vector, and struct types.
- Add tests for partial updates and multiple updates (using range verification).
- Add tests for pipeline switching (no inheritance).
- Add tests for large data (maxImmediateSize) with range verification.
- Add tests for TypedArray arguments with offsets.
- Add tests for mixing render pass and bundle execution.
---
 .../programmable/immediate.spec.ts            | 974 ++++++++++++++++++
 1 file changed, 974 insertions(+)
 create mode 100644 src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts
diff --git a/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts b/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts
new file mode 100644
index 000000000000..a2bda7c7bf9e
--- /dev/null
+++ b/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts
@@ -0,0 +1,974 @@
+export const description = `
+Operation tests for immediate data usage in RenderPassEncoder, ComputePassEncoder, and RenderBundleEncoder.
+`;
+
+import { makeTestGroup } from '../../../../../common/framework/test_group.js';
+import { getGPU } from '../../../../../common/util/navigator_gpu.js';
+import {
+  assert,
+  kTypedArrayBufferViews,
+  kTypedArrayBufferViewKeys,
+  supportsImmediateData,
+  unreachable,
+} from '../../../../../common/util/util.js';
+import { AllFeaturesMaxLimitsGPUTest } from '../../../../gpu_test.js';
+import { HostSharableTypes, kVectorContainerTypes, ScalarType } from '../../../../shader/types.js';
+import {
+  kProgrammableEncoderTypes,
+  ProgrammableEncoderType,
+} from '../../../../util/command_buffer_maker.js';
+
+class ImmediateDataOperationTest extends AllFeaturesMaxLimitsGPUTest {
+  override async init() {
+    await super.init();
+
+    if (!supportsImmediateData(getGPU(this.rec))) {
+      this.skip('Immediate data not supported');
+      return;
+    }
+  }
+
+  skipIfStorageBuffersInFragmentStageNotAvailable(encoderType: ProgrammableEncoderType) {
+    if (!this.isCompatibility) {
+      return;
+    }
+    const needsStorageBuffersInFragmentStage =
+      encoderType === 'render pass' || encoderType === 'render bundle';
+    this.skipIf(
+      needsStorageBuffersInFragmentStage &&
+        !(this.device.limits.maxStorageBuffersInFragmentStage! >= 1),
+      `maxStorageBuffersInFragmentStage(${this.device.limits.maxStorageBuffersInFragmentStage}) < 1`
+    );
+  }
+}
+
+function createPipeline(
+  t: AllFeaturesMaxLimitsGPUTest,
+  encoderType: ProgrammableEncoderType,
+  wgslDecl: string,
+  wgslUsage: string,
+  immediateSize: number
+) {
+  const layout = t.device.createPipelineLayout({
+    bindGroupLayouts: [
+      t.device.createBindGroupLayout({
+        entries: [
+          {
+            binding: 0,
+            visibility: GPUShaderStage.COMPUTE | GPUShaderStage.FRAGMENT,
+            buffer: { type: 'storage' },
+          },
+          {
+            binding: 1,
+            visibility: GPUShaderStage.COMPUTE | GPUShaderStage.FRAGMENT,
+            buffer: { type: 'uniform', hasDynamicOffset: true },
+          },
+        ],
+      }),
+    ],
+    immediateSize,
+  });
+
+  const fullCode = `
+    ${wgslDecl}
+    @group(0) @binding(0) var<storage, read_write> output: array<u32>;
+    @group(0) @binding(1) var<uniform> outIndex: u32;
+
+    ${wgslUsage}
+  `;
+
+  if (encoderType === 'compute pass') {
+    return t.device.createComputePipeline({
+      layout,
+      compute: {
+        module: t.device.createShaderModule({ code: fullCode }),
+        entryPoint: 'cs_main',
+      },
+    });
+  } else {
+    return t.device.createRenderPipeline({
+      layout,
+      vertex: {
+        module: t.device.createShaderModule({
+          code: `
+            @vertex fn vs_main(@builtin(vertex_index) vIdx: u32) -> @builtin(position) vec4f {
+              // Map vIdx 0..3 to pixel centers.
+              // Uses a 4x1 texture.
+              // vIdx 0 -> pixel 0 -> x in [-1, -0.5]
+              // vIdx 1 -> pixel 1 -> x in [-0.5, 0]
+              let x = (f32(vIdx) + 0.5) / 2.0 - 1.0;
+              return vec4f(x, 0.0, 0.0, 1.0);
+            }
+          `,
+        }),
+        entryPoint: 'vs_main',
+      },
+      fragment: {
+        module: t.device.createShaderModule({ code: fullCode }),
+        entryPoint: 'fs_main',
+        targets: [{ format: 'r32uint' }],
+      },
+      primitive: {
+        topology: 'point-list',
+      },
+    });
+  }
+}
+
+function executePass(
+  t: AllFeaturesMaxLimitsGPUTest,
+  encoderType: ProgrammableEncoderType,
+  commandEncoder: GPUCommandEncoder,
+  fn: (pass: GPURenderPassEncoder | GPUComputePassEncoder | GPURenderBundleEncoder) => void
+) {
+  if (encoderType === 'compute pass') {
+    const pass = commandEncoder.beginComputePass();
+    fn(pass);
+    pass.end();
+  } else {
+    const renderTargetTexture = t.createTextureTracked({
+      size: [4, 1, 1],
+      format: 'r32uint',
+      usage: GPUTextureUsage.RENDER_ATTACHMENT | GPUTextureUsage.COPY_SRC,
+    });
+
+    if (encoderType === 'render pass') {
+      const pass = commandEncoder.beginRenderPass({
+        colorAttachments: [
+          {
+            view: renderTargetTexture.createView(),
+            loadOp: 'clear',
+            storeOp: 'store',
+            clearValue: { r: 0, g: 0, b: 0, a: 0 },
+          },
+        ],
+      });
+      fn(pass);
+      pass.end();
+    } else {
+      // Render Bundle
+      const bundleEncoder = t.device.createRenderBundleEncoder({
+        colorFormats: ['r32uint'],
+      });
+      fn(bundleEncoder);
+      const bundle = bundleEncoder.finish();
+
+      const pass = commandEncoder.beginRenderPass({
+        colorAttachments: [
+          {
+            view: renderTargetTexture.createView(),
+            loadOp: 'clear',
+            storeOp: 'store',
+            clearValue: { r: 0, g: 0, b: 0, a: 0 },
+          },
+        ],
+      });
+      pass.executeBundles([bundle]);
+      pass.end();
+    }
+  }
+}
+
+function runAndCheck(
+  t: AllFeaturesMaxLimitsGPUTest,
+  encoderType: ProgrammableEncoderType,
+  pipeline: GPURenderPipeline | GPUComputePipeline,
+  setImmediatesFn: (
+    encoder: GPUComputePassEncoder | GPURenderPassEncoder | GPURenderBundleEncoder
+  ) => void,
+  expectedValues: number[]
+) {
+  const outputBuffer = t.createBufferTracked({
+    size: 4 * 4, // 4 u32s
+    usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
+  });
+  const indexUniformBuffer = t.createBufferTracked({
+    size: 256 * 4, // Enough for dynamic offsets
+    usage: GPUBufferUsage.UNIFORM,
+    mappedAtCreation: true,
+  });
+  // Initialize indices 0, 1, 2, 3 at offsets 0, 256, 512, 768
+  const indexData = new Uint32Array(indexUniformBuffer.getMappedRange());
+  indexData[0] = 0;
+  indexData[256 / 4] = 1;
+  indexData[512 / 4] = 2;
+  indexData[768 / 4] = 3;
+  indexUniformBuffer.unmap();
+
+  const bindGroup = t.device.createBindGroup({
+    layout: pipeline.getBindGroupLayout(0),
+    entries: [
+      { binding: 0, resource: { buffer: outputBuffer } },
+      { binding: 1, resource: { buffer: indexUniformBuffer, size: 4 } },
+    ],
+  });
+
+  const commandEncoder = t.device.createCommandEncoder();
+  executePass(t, encoderType, commandEncoder, encoder => {
+    if (encoderType === 'compute pass') {
+      const pass = encoder as GPUComputePassEncoder;
+      pass.setPipeline(pipeline as GPUComputePipeline);
+      pass.setBindGroup(0, bindGroup, [0]); // Index 0
+      setImmediatesFn(pass);
+      pass.dispatchWorkgroups(1);
+    } else if (encoderType === 'render pass') {
+      const pass = encoder as GPURenderPassEncoder;
+      pass.setPipeline(pipeline as GPURenderPipeline);
+      pass.setBindGroup(0, bindGroup, [0]);
+      setImmediatesFn(pass);
+      pass.draw(1, 1, 0, 0); // Vertex 0 -> Pixel 0
+    } else if (encoderType === 'render bundle') {
+      const bundleEncoder = encoder as GPURenderBundleEncoder;
+      bundleEncoder.setPipeline(pipeline as GPURenderPipeline);
+      bundleEncoder.setBindGroup(0, bindGroup, [0]);
+      setImmediatesFn(bundleEncoder);
+      bundleEncoder.draw(1, 1, 0, 0);
+    } else {
+      unreachable();
+    }
+  });
+
+  t.device.queue.submit([commandEncoder.finish()]);
+
+  t.expectGPUBufferValuesEqual(outputBuffer, new Uint32Array(expectedValues));
+}
+
+export const g = makeTestGroup(ImmediateDataOperationTest);
+
+g.test('basic_execution')
+  .desc('Verify immediate data is correctly passed to shaders.')
+  .params(u =>
+    u.combine('encoderType', kProgrammableEncoderTypes).expandWithParams(function* () {
+      // Scalars
+      for (const s of HostSharableTypes) {
+        yield { dataType: s, scalarType: s, vectorSize: 1 };
+      }
+      // Vectors
+      for (const v of kVectorContainerTypes) {
+        const size = parseInt(v[3]);
+        for (const s of HostSharableTypes) {
+          yield { dataType: `${v}<${s}>`, scalarType: s, vectorSize: size };
+        }
+      }
+      // Struct
+      yield { dataType: 'struct', scalarType: undefined, vectorSize: undefined };
+    })
+  )
+  .fn(t => {
+    const { encoderType, dataType, scalarType, vectorSize } = t.params;
+    t.skipIf(scalarType === 'f16', 'Immediate data blocks do not yet support f16 types');
+    t.skipIfStorageBuffersInFragmentStageNotAvailable(encoderType);
+
+    let wgslDecl = '';
+    let wgslUsage = '';
+    let immediateSize = 0;
+    let expected: number[] = [];
+    let inputData: Uint32Array;
+
+    if (dataType === 'struct') {
+      immediateSize = 8;
+      wgslDecl = `
+        struct S { a: u32, b: u32 }
+        var<immediate> data: S;
+      `;
+      wgslUsage = `
+        @compute @workgroup_size(1) fn cs_main() {
+          output[0] = data.a; output[1] = data.b;
+        }
+        @fragment fn fs_main() -> @location(0) vec4u {
+          output[0] = data.a; output[1] = data.b;
+          return vec4u(0);
+        }
+      `;
+      inputData = new Uint32Array([0xdeadbeef, 0xcafebabe]);
+      expected = [0xdeadbeef, 0xcafebabe];
+    } else {
+      // Non-struct types (scalar or vector)
+      const sType = scalarType as ScalarType;
+      const vSize = vectorSize as number;
+
+      immediateSize = vSize * 4;
+      wgslDecl = `var<immediate> data: ${dataType};`;
+
+      let readCode = '';
+      for (let i = 0; i < vSize; i++) {
+        let valExpr = vSize === 1 ? 'data' : `data[${i}]`;
+        if (sType === 'i32' || sType === 'f32') {
+          valExpr = `bitcast<u32>(${valExpr})`;
+        }
+        readCode += `output[${i}] = ${valExpr};`;
+      }
+
+      wgslUsage = `
+        @compute @workgroup_size(1) fn cs_main() {
+          ${readCode}
+        }
+        @fragment fn fs_main() -> @location(0) vec4u {
+          ${readCode}
+          return vec4u(0);
+        }
+      `;
+
+      inputData = new Uint32Array(vSize);
+      for (let i = 0; i < vSize; i++) {
+        if (sType === 'u32') {
+          const val = 0x10000000 + i;
+          inputData[i] = val;
+          expected.push(val);
+        } else if (sType === 'i32') {
+          const val = -1000 - i;
+          inputData[i] = new Uint32Array(new Int32Array([val]).buffer)[0];
+          expected.push(inputData[i]);
+        } else if (sType === 'f32') {
+          const val = 1.5 + i;
+          inputData[i] = new Uint32Array(new Float32Array([val]).buffer)[0];
+          expected.push(inputData[i]);
+        } else {
+          unreachable(`Unhandled scalar type: ${sType}`);
+        }
+      }
+    }
+
+    const pipeline = createPipeline(t, encoderType, wgslDecl, wgslUsage, immediateSize);
+
+    runAndCheck(
+      t,
+      encoderType,
+      pipeline,
+      encoder => {
+        encoder.setImmediates!(0, inputData.buffer, 0, inputData.buffer.byteLength);
+      },
+      expected
+    );
+  });
+
+g.test('update_data')
+  .desc('Verify setImmediates updates data correctly within a pass, including partial updates.')
+  .params(u => u.combine('encoderType', kProgrammableEncoderTypes))
+  .fn(t => {
+    const { encoderType } = t.params;
+    t.skipIfStorageBuffersInFragmentStageNotAvailable(encoderType);
+    const immediateSize = 16;
+    const wgslDecl = 'var<immediate> data: vec4<u32>;';
+    const wgslUsage = `
+      @compute @workgroup_size(1) fn cs_main() {
+        let base = outIndex * 4;
+        output[base + 0] = data[0];
+        output[base + 1] = data[1];
+        output[base + 2] = data[2];
+        output[base + 3] = data[3];
+      }
+      @fragment fn fs_main() -> @location(0) vec4u {
+        let base = outIndex * 4;
+        output[base + 0] = data[0];
+        output[base + 1] = data[1];
+        output[base + 2] = data[2];
+        output[base + 3] = data[3];
+        return vec4u(0);
+      }
+    `;
+
+    const pipeline = createPipeline(t, encoderType, wgslDecl, wgslUsage, immediateSize);
+
+    const outputBuffer = t.createBufferTracked({
+      size: 4 * 4 * 3, // 3 steps, 4 u32s each
+      usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
+    });
+
+    const indexUniformBuffer = t.createBufferTracked({
+      size: 256 * 3,
+      usage: GPUBufferUsage.UNIFORM,
+      mappedAtCreation: true,
+    });
+    const indexData = new Uint32Array(indexUniformBuffer.getMappedRange());
+    indexData[0] = 0;
+    indexData[256 / 4] = 1;
+    indexData[512 / 4] = 2;
+    indexUniformBuffer.unmap();
+
+    const bindGroup = t.device.createBindGroup({
+      layout: pipeline.getBindGroupLayout(0),
+      entries: [
+        { binding: 0, resource: { buffer: outputBuffer } },
+        { binding: 1, resource: { buffer: indexUniformBuffer, size: 4 } },
+      ],
+    });
+
+    const commandEncoder = t.device.createCommandEncoder();
+
+    const runStep = (
+      pass: GPURenderPassEncoder | GPUComputePassEncoder | GPURenderBundleEncoder,
+      stepIndex: number,
+      data: Uint32Array,
+      dstOffset: number = 0
+    ) => {
+      pass.setBindGroup(0, bindGroup, [stepIndex * 256]);
+      pass.setImmediates!(dstOffset, data, 0, data.length);
+
+      if (encoderType === 'compute pass') {
+        (pass as GPUComputePassEncoder).dispatchWorkgroups(1);
+      } else if (encoderType === 'render pass') {
+        (pass as GPURenderPassEncoder).draw(1, 1, 0, 0);
+      } else {
+        (pass as GPURenderBundleEncoder).draw(1, 1, 0, 0);
+      }
+    };
+
+    executePass(t, encoderType, commandEncoder, enc => {
+      if (encoderType === 'compute pass') {
+        (enc as GPUComputePassEncoder).setPipeline(pipeline as GPUComputePipeline);
+      } else if (encoderType === 'render pass') {
+        (enc as GPURenderPassEncoder).setPipeline(pipeline as GPURenderPipeline);
+      } else {
+        (enc as GPURenderBundleEncoder).setPipeline(pipeline as GPURenderPipeline);
+      }
+
+      // Step 1: Full set [1, 2, 3, 4]
+      runStep(enc, 0, new Uint32Array([1, 2, 3, 4]));
+
+      // Step 2: Full update [5, 6, 7, 8]
+      runStep(enc, 1, new Uint32Array([5, 6, 7, 8]));
+
+      // Step 3: Partial update offset 4 bytes (index 1) with [9, 10] -> [5, 9, 10, 8]
+      runStep(enc, 2, new Uint32Array([9, 10]), 4);
+    });
+
+    t.device.queue.submit([commandEncoder.finish()]);
+
+    const expected = new Uint32Array([1, 2, 3, 4, 5, 6, 7, 8, 5, 9, 10, 8]);
+    t.expectGPUBufferValuesEqual(outputBuffer, expected);
+  });
+
+g.test('pipeline_switch')
+  .desc(
+    `Verify immediate data is correctly set after switching pipelines.
+    - compatible: Both pipelines use the same immediateSize (same layout).
+    - incompatible: Pipelines use different immediateSize values (different layouts).
+    In both cases, immediates must be reset correctly between draws/dispatches.`
+  )
+  .params(u =>
+    u
+      .combine('encoderType', ['render pass', 'compute pass'] as const)
+      .combine('compatibility', ['compatible', 'incompatible'] as const)
+  )
+  .fn(t => {
+    const { encoderType, compatibility } = t.params;
+    t.skipIfStorageBuffersInFragmentStageNotAvailable(encoderType);
+
+    // Pipeline A always uses vec4<u32> (16 bytes).
+    const wgslDeclA = 'var<immediate> data: vec4<u32>;';
+    const wgslUsageA = `
+      @compute @workgroup_size(1) fn cs_main() {
+        let base = outIndex * 4;
+        output[base] = data.x; output[base+1] = data.y; output[base+2] = data.z; output[base+3] = data.w;
+      }
+      @fragment fn fs_main() -> @location(0) vec4u {
+        let base = outIndex * 4;
+        output[base] = data.x; output[base+1] = data.y; output[base+2] = data.z; output[base+3] = data.w;
+        return vec4u(0);
+      }
+      `;
+
+    let wgslDeclB: string;
+    let wgslUsageB: string;
+    let immediateSizeB: number;
+
+    if (compatibility === 'compatible') {
+      // Pipeline B has the same immediate layout as A (vec4<u32>, 16 bytes).
+      wgslDeclB = wgslDeclA;
+      wgslUsageB = wgslUsageA;
+      immediateSizeB = 16;
+    } else {
+      // Pipeline B uses vec2<u32> (8 bytes) — different/incompatible layout.
+      wgslDeclB = 'var<immediate> data: vec2<u32>;';
+      wgslUsageB = `
+        @compute @workgroup_size(1) fn cs_main() {
+          let base = outIndex * 4;
+          output[base] = data.x; output[base+1] = data.y; output[base+2] = 0u; output[base+3] = 0u;
+        }
+        @fragment fn fs_main() -> @location(0) vec4u {
+          let base = outIndex * 4;
+          output[base] = data.x; output[base+1] = data.y; output[base+2] = 0u; output[base+3] = 0u;
+          return vec4u(0);
+        }
+        `;
+      immediateSizeB = 8;
+    }
+
+    const pipelineA = createPipeline(t, encoderType, wgslDeclA, wgslUsageA, 16);
+    const pipelineB = createPipeline(t, encoderType, wgslDeclB, wgslUsageB, immediateSizeB);
+
+    const outputBuffer = t.createBufferTracked({
+      size: 32,
+      usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
+    });
+    const indexUniformBuffer = t.createBufferTracked({
+      size: 512,
+      usage: GPUBufferUsage.UNIFORM,
+      mappedAtCreation: true,
+    });
+    const indexData = new Uint32Array(indexUniformBuffer.getMappedRange());
+    indexData[0] = 0;
+    indexData[256 / 4] = 1;
+    indexUniformBuffer.unmap();
+
+    const bindGroupA = t.device.createBindGroup({
+      layout: pipelineA.getBindGroupLayout(0),
+      entries: [
+        { binding: 0, resource: { buffer: outputBuffer } },
+        { binding: 1, resource: { buffer: indexUniformBuffer, size: 4 } },
+      ],
+    });
+    const bindGroupB = t.device.createBindGroup({
+      layout: pipelineB.getBindGroupLayout(0),
+      entries: [
+        { binding: 0, resource: { buffer: outputBuffer } },
+        { binding: 1, resource: { buffer: indexUniformBuffer, size: 4 } },
+      ],
+    });
+
+    const commandEncoder = t.device.createCommandEncoder();
+
+    if (compatibility === 'compatible') {
+      executePass(t, encoderType, commandEncoder, enc => {
+        if (encoderType === 'compute pass') {
+          const pass = enc as GPUComputePassEncoder;
+          // 1. Set Pipeline A, set immediates [1, 2, 3, 4], dispatch
+          pass.setPipeline(pipelineA as GPUComputePipeline);
+          pass.setBindGroup(0, bindGroupA, [0]);
+          pass.setImmediates!(0, new Uint32Array([1, 2, 3, 4]), 0, 4);
+          pass.dispatchWorkgroups(1);
+
+          // 2. Switch to Pipeline B (compatible), set new immediates [5, 6, 7, 8], dispatch
+          pass.setPipeline(pipelineB as GPUComputePipeline);
+          pass.setBindGroup(0, bindGroupB, [256]);
+          pass.setImmediates!(0, new Uint32Array([5, 6, 7, 8]), 0, 4);
+          pass.dispatchWorkgroups(1);
+        } else {
+          const pass = enc as GPURenderPassEncoder;
+          // 1. Set Pipeline A, set immediates [1, 2, 3, 4], draw
+          pass.setPipeline(pipelineA as GPURenderPipeline);
+          pass.setBindGroup(0, bindGroupA, [0]);
+          pass.setImmediates!(0, new Uint32Array([1, 2, 3, 4]), 0, 4);
+          pass.draw(1, 1, 0, 0);
+
+          // 2. Switch to Pipeline B (compatible), set new immediates [5, 6, 7, 8], draw
+          pass.setPipeline(pipelineB as GPURenderPipeline);
+          pass.setBindGroup(0, bindGroupB, [256]);
+          pass.setImmediates!(0, new Uint32Array([5, 6, 7, 8]), 0, 4);
+          pass.draw(1, 1, 1, 0);
+        }
+      });
+
+      t.device.queue.submit([commandEncoder.finish()]);
+
+      const expected = new Uint32Array([1, 2, 3, 4, 5, 6, 7, 8]);
+      t.expectGPUBufferValuesEqual(outputBuffer, expected);
+    } else {
+      // Incompatible: Pipeline A uses 16 bytes, Pipeline B uses 8 bytes.
+      executePass(t, encoderType, commandEncoder, enc => {
+        if (encoderType === 'compute pass') {
+          const pass = enc as GPUComputePassEncoder;
+          // 1. Set Pipeline A (16 bytes), set immediates [1, 2, 3, 4], dispatch
+          pass.setPipeline(pipelineA as GPUComputePipeline);
+          pass.setBindGroup(0, bindGroupA, [0]);
+          pass.setImmediates!(0, new Uint32Array([1, 2, 3, 4]), 0, 4);
+          pass.dispatchWorkgroups(1);
+
+          // 2. Switch to Pipeline B (8 bytes), set new immediates [5, 6], dispatch
+          pass.setPipeline(pipelineB as GPUComputePipeline);
+          pass.setBindGroup(0, bindGroupB, [256]);
+          pass.setImmediates!(0, new Uint32Array([5, 6]), 0, 2);
+          pass.dispatchWorkgroups(1);
+        } else {
+          const pass = enc as GPURenderPassEncoder;
+          // 1. Set Pipeline A (16 bytes), set immediates [1, 2, 3, 4], draw
+          pass.setPipeline(pipelineA as GPURenderPipeline);
+          pass.setBindGroup(0, bindGroupA, [0]);
+          pass.setImmediates!(0, new Uint32Array([1, 2, 3, 4]), 0, 4);
+          pass.draw(1, 1, 0, 0);
+
+          // 2. Switch to Pipeline B (8 bytes), set new immediates [5, 6], draw
+          pass.setPipeline(pipelineB as GPURenderPipeline);
+          pass.setBindGroup(0, bindGroupB, [256]);
+          pass.setImmediates!(0, new Uint32Array([5, 6]), 0, 2);
+          pass.draw(1, 1, 1, 0);
+        }
+      });
+
+      t.device.queue.submit([commandEncoder.finish()]);
+
+      // Pipeline A wrote [1, 2, 3, 4]; Pipeline B wrote [5, 6, 0, 0] (only 2 fields, rest zeroed).
+      const expected = new Uint32Array([1, 2, 3, 4, 5, 6, 0, 0]);
+      t.expectGPUBufferValuesEqual(outputBuffer, expected);
+    }
+  });
+
+g.test('use_max_immediate_size')
+  .desc('Verify setImmediates with maxImmediateSize.')
+  .params(u => u.combine('encoderType', kProgrammableEncoderTypes))
+  .fn(t => {
+    const { encoderType } = t.params;
+    t.skipIfStorageBuffersInFragmentStageNotAvailable(encoderType);
+    assert(t.device.limits.maxImmediateSize !== undefined);
+    const maxImmediateSize = t.device.limits.maxImmediateSize;
+    // Create a pipeline that reads the first and last u32 of the immediate data
+    const count = maxImmediateSize / 4;
+    const members: string[] = [];
+    for (let i = 0; i < count; i++) {
+      members.push(`m${i}: u32`);
+    }
+    const wgslDecl = `struct Large { ${members.join(', ')} } var<immediate> data: Large;`;
+    const wgslUsage = `
+      @compute @workgroup_size(1) fn cs_main() {
+        output[0] = data.m0;
+        output[1] = data.m${count - 1};
+      }
+      @fragment fn fs_main() -> @location(0) vec4u {
+        output[0] = data.m0;
+        output[1] = data.m${count - 1};
+        return vec4u(0);
+      }
+    `;
+
+    const pipeline = createPipeline(t, encoderType, wgslDecl, wgslUsage, maxImmediateSize);
+
+    const outputBuffer = t.createBufferTracked({
+      size: 8,
+      usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
+    });
+    const indexUniformBuffer = t.makeBufferWithContents(
+      new Uint32Array(256 / 4),
+      GPUBufferUsage.UNIFORM
+    );
+
+    const bindGroup = t.device.createBindGroup({
+      layout: pipeline.getBindGroupLayout(0),
+      entries: [
+        { binding: 0, resource: { buffer: outputBuffer } },
+        { binding: 1, resource: { buffer: indexUniformBuffer, size: 4 } },
+      ],
+    });
+
+    const commandEncoder = t.device.createCommandEncoder();
+    executePass(t, encoderType, commandEncoder, enc => {
+      const data = new Uint32Array(count);
+      data[0] = 0xdeadbeef;
+      data[count - 1] = 0xcafebabe;
+
+      if (encoderType === 'compute pass') {
+        const pass = enc as GPUComputePassEncoder;
+        pass.setPipeline(pipeline as GPUComputePipeline);
+        pass.setBindGroup(0, bindGroup, [0]);
+        pass.setImmediates!(0, data, 0, count);
+        pass.dispatchWorkgroups(1);
+      } else {
+        const pass = enc as GPURenderPassEncoder | GPURenderBundleEncoder;
+        pass.setPipeline(pipeline as GPURenderPipeline);
+        pass.setBindGroup(0, bindGroup, [0]);
+        pass.setImmediates!(0, data, 0, count);
+        pass.draw(1, 1, 0, 0);
+      }
+    });
+
+    t.device.queue.submit([commandEncoder.finish()]);
+    t.expectGPUBufferValuesEqual(outputBuffer, new Uint32Array([0xdeadbeef, 0xcafebabe]));
+  });
+
+g.test('typed_array_arguments')
+  .desc('Verify srcOffset and srcSize are in elements for TypedArrays.')
+  .params(u =>
+    u
+      .combine('typedArray', kTypedArrayBufferViewKeys)
+      .combine('encoderType', kProgrammableEncoderTypes)
+  )
+  .fn(t => {
+    const { typedArray, encoderType } = t.params;
+    t.skipIf(typedArray === 'Float16Array', 'TODO(#4297): Float16Array not yet supported');
+    t.skipIfStorageBuffersInFragmentStageNotAvailable(encoderType);
+    const Ctor = kTypedArrayBufferViews[typedArray];
+    const elementSize = Ctor.BYTES_PER_ELEMENT;
+
+    // Write a known pattern.
+    // Use a buffer of 8 bytes.
+    // Pattern: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08
+    const bytePattern = [0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08];
+
+    // Create a large buffer to test offset.
+    // Use offset = 2 elements.
+    // Need enough data.
+    // Construct the TypedArray such that at index `offset`, the pattern exists.
+    const offset = 2;
+    const elementCount = 8 / elementSize; // Write 8 bytes.
+
+    // Total elements needed: offset + elementCount + padding
+    const totalElements = offset + elementCount + 2;
+    const arr = new Ctor(totalElements);
+
+    // Fill with non-zero value.
+    const fillView = new Uint8Array(arr.buffer);
+    fillView.fill(0xaa);
+
+    // The bytes at `arr[offset]...` should match `bytePattern`.
+    // Use a DataView on the array's buffer to set the bytes.
+    const buffer = arr.buffer;
+    const byteOffset = arr.byteOffset + offset * elementSize;
+    const view = new DataView(buffer);
+
+    for (let i = 0; i < 8; i++) {
+      view.setUint8(byteOffset + i, bytePattern[i]);
+    }
+
+    // Now `arr` contains the pattern at `offset`.
+
+    // Shader: read 2 u32s (8 bytes).
+    const wgslDecl = 'var<immediate> data: array<u32, 2>;';
+    const wgslUsage = `
+      @compute @workgroup_size(1) fn cs_main() {
+        output[0] = data[0];
+        output[1] = data[1];
+      }
+      @fragment fn fs_main() -> @location(0) vec4u {
+        output[0] = data[0];
+        output[1] = data[1];
+        return vec4u(0);
+      }
+    `;
+
+    const pipeline = createPipeline(t, encoderType, wgslDecl, wgslUsage, 8);
+
+    const outputBuffer = t.createBufferTracked({
+      size: 8,
+      usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
+    });
+    const indexUniformBuffer = t.makeBufferWithContents(
+      new Uint32Array(256 / 4),
+      GPUBufferUsage.UNIFORM
+    );
+
+    const bindGroup = t.device.createBindGroup({
+      layout: pipeline.getBindGroupLayout(0),
+      entries: [
+        { binding: 0, resource: { buffer: outputBuffer } },
+        { binding: 1, resource: { buffer: indexUniformBuffer, size: 4 } },
+      ],
+    });
+
+    const commandEncoder = t.device.createCommandEncoder();
+    executePass(t, encoderType, commandEncoder, enc => {
+      if (encoderType === 'compute pass') {
+        const pass = enc as GPUComputePassEncoder;
+        pass.setPipeline(pipeline as GPUComputePipeline);
+        pass.setBindGroup(0, bindGroup, [0]);
+        pass.setImmediates!(0, arr, offset, elementCount);
+        pass.dispatchWorkgroups(1);
+      } else {
+        const pass = enc as GPURenderPassEncoder | GPURenderBundleEncoder;
+        pass.setPipeline(pipeline as GPURenderPipeline);
+        pass.setBindGroup(0, bindGroup, [0]);
+        pass.setImmediates!(0, arr, offset, elementCount);
+        pass.draw(1, 1, 0, 0);
+      }
+    });
+
+    t.device.queue.submit([commandEncoder.finish()]);
+
+    // Expected: 0x04030201, 0x08070605 (Little Endian)
+    const expected = new Uint32Array([0x04030201, 0x08070605]);
+    t.expectGPUBufferValuesEqual(outputBuffer, expected);
+  });
+
+g.test('multiple_updates_before_draw_or_dispatch')
+  .desc(
+    'Verify that multiple setImmediates calls before a draw or dispatch result in the latest content being used (merging updates).'
+  )
+  .params(u => u.combine('encoderType', kProgrammableEncoderTypes))
+  .fn(t => {
+    const { encoderType } = t.params;
+    t.skipIfStorageBuffersInFragmentStageNotAvailable(encoderType);
+    // Use vec4<u32> to allow partial updates.
+    const wgslDecl = 'var<immediate> data: vec4<u32>;';
+    const wgslUsage = `
+      @compute @workgroup_size(1) fn cs_main() {
+        output[0] = data.x; output[1] = data.y; output[2] = data.z; output[3] = data.w;
+      }
+      @fragment fn fs_main() -> @location(0) vec4u {
+        output[0] = data.x; output[1] = data.y; output[2] = data.z; output[3] = data.w;
+        return vec4u(0);
+      }
+    `;
+    const pipeline = createPipeline(t, encoderType, wgslDecl, wgslUsage, 16);
+
+    runAndCheck(
+      t,
+      encoderType,
+      pipeline,
+      encoder => {
+        // 1. Set all to [1, 2, 3, 4]
+        encoder.setImmediates!(0, new Uint32Array([1, 2, 3, 4]), 0, 4);
+        // 2. Update middle two to [5, 6] -> [1, 5, 6, 4]
+        encoder.setImmediates!(4, new Uint32Array([5, 6]), 0, 2);
+        // 3. Update last to [7] -> [1, 5, 6, 7]
+        encoder.setImmediates!(12, new Uint32Array([7]), 0, 1);
+      },
+      [1, 5, 6, 7]
+    );
+  });
+
+g.test('render_pass_and_bundle_mix')
+  .desc('Verify interaction between executeBundles and direct render pass commands.')
+  .fn(t => {
+    t.skipIfStorageBuffersInFragmentStageNotAvailable('render pass');
+    const wgslDecl = 'var<immediate> data: vec2<u32>;';
+    const wgslUsage = `
+      @fragment fn fs_main() -> @location(0) vec4u {
+        let base = outIndex * 2;
+        output[base] = data.x;
+        output[base + 1] = data.y;
+        return vec4u(0);
+      }
+    `;
+    // Use 'render pass' type to create the pipeline, but it works for bundle too.
+    // Immediate size: vec2<u32> = 2 * 4 bytes = 8 bytes.
+    const pipeline = createPipeline(t, 'render pass', wgslDecl, wgslUsage, 8) as GPURenderPipeline;
+
+    const outputBuffer = t.createBufferTracked({
+      size: 16,
+      usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
+    });
+    const indexUniformBuffer = t.createBufferTracked({
+      size: 512,
+      usage: GPUBufferUsage.UNIFORM,
+      mappedAtCreation: true,
+    });
+    const idxData = new Uint32Array(indexUniformBuffer.getMappedRange());
+    idxData[0] = 0;
+    idxData[256 / 4] = 1;
+    indexUniformBuffer.unmap();
+
+    const bindGroup = t.device.createBindGroup({
+      layout: pipeline.getBindGroupLayout(0),
+      entries: [
+        { binding: 0, resource: { buffer: outputBuffer } },
+        { binding: 1, resource: { buffer: indexUniformBuffer, size: 4 } },
+      ],
+    });
+
+    // Bundle: Set [1, 10], Draw (Index 0)
+    const bundleEncoder = t.device.createRenderBundleEncoder({ colorFormats: ['r32uint'] });
+    bundleEncoder.setPipeline(pipeline);
+    bundleEncoder.setBindGroup(0, bindGroup, [0]);
+    bundleEncoder.setImmediates!(0, new Uint32Array([1, 10]), 0, 2);
+    bundleEncoder.draw(1, 1, 0, 0);
+    const bundle = bundleEncoder.finish();
+
+    const renderTargetTexture = t.createTextureTracked({
+      size: [4, 1, 1],
+      format: 'r32uint',
+      usage: GPUTextureUsage.RENDER_ATTACHMENT | GPUTextureUsage.COPY_SRC,
+    });
+    const commandEncoder = t.device.createCommandEncoder();
+    const pass = commandEncoder.beginRenderPass({
+      colorAttachments: [
+        {
+          view: renderTargetTexture.createView(),
+          loadOp: 'clear',
+          storeOp: 'store',
+          clearValue: { r: 0, g: 0, b: 0, a: 0 },
+        },
+      ],
+    });
+
+    // Execute Bundle
+    pass.executeBundles([bundle]);
+
+    // Pass: Set [2, 20], Draw (Index 1)
+    pass.setPipeline(pipeline);
+    pass.setBindGroup(0, bindGroup, [256]);
+    pass.setImmediates!(0, new Uint32Array([2, 20]), 0, 2);
+    pass.draw(1, 1, 1, 0);
+
+    pass.end();
+    t.device.queue.submit([commandEncoder.finish()]);
+
+    t.expectGPUBufferValuesEqual(outputBuffer, new Uint32Array([1, 10, 2, 20]));
+  });
+
+g.test('render_bundle_isolation')
+  .desc('Verify that immediate data state is isolated between bundles executed in the same pass.')
+  .fn(t => {
+    t.skipIfStorageBuffersInFragmentStageNotAvailable('render bundle');
+    const wgslDecl = 'var<immediate> data: vec2<u32>;';
+    const wgslUsage = `
+      @fragment fn fs_main() -> @location(0) vec4u {
+        let base = outIndex * 2;
+        output[base] = data.x;
+        output[base + 1] = data.y;
+        return vec4u(0);
+      }
+    `;
+    const pipeline = createPipeline(t, 'render pass', wgslDecl, wgslUsage, 8) as GPURenderPipeline;
+
+    const outputBuffer = t.createBufferTracked({
+      size: 16, // 2 draws * 2 u32s * 4 bytes
+      usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
+    });
+    const indexUniformBuffer = t.createBufferTracked({
+      size: 512,
+      usage: GPUBufferUsage.UNIFORM,
+      mappedAtCreation: true,
+    });
+    const idxData = new Uint32Array(indexUniformBuffer.getMappedRange());
+    idxData[0] = 0;
+    idxData[256 / 4] = 1;
+    indexUniformBuffer.unmap();
+
+    const bindGroup = t.device.createBindGroup({
+      layout: pipeline.getBindGroupLayout(0),
+      entries: [
+        { binding: 0, resource: { buffer: outputBuffer } },
+        { binding: 1, resource: { buffer: indexUniformBuffer, size: 4 } },
+      ],
+    });
+
+    // Bundle A: Set [1, 2], Draw (Index 0)
+    const bundleEncoderA = t.device.createRenderBundleEncoder({ colorFormats: ['r32uint'] });
+    bundleEncoderA.setPipeline(pipeline);
+    bundleEncoderA.setBindGroup(0, bindGroup, [0]);
+    bundleEncoderA.setImmediates!(0, new Uint32Array([1, 2]), 0, 2);
+    bundleEncoderA.draw(1, 1, 0, 0);
+    const bundleA = bundleEncoderA.finish();
+
+    // Bundle B: Set [3, 4], Draw (Index 1)
+    const bundleEncoderB = t.device.createRenderBundleEncoder({ colorFormats: ['r32uint'] });
+    bundleEncoderB.setPipeline(pipeline);
+    bundleEncoderB.setBindGroup(0, bindGroup, [256]);
+    bundleEncoderB.setImmediates!(0, new Uint32Array([3, 4]), 0, 2);
+    bundleEncoderB.draw(1, 1, 1, 0);
+    const bundleB = bundleEncoderB.finish();
+
+    const renderTargetTexture = t.createTextureTracked({
+      size: [4, 1, 1],
+      format: 'r32uint',
+      usage: GPUTextureUsage.RENDER_ATTACHMENT | GPUTextureUsage.COPY_SRC,
+    });
+    const commandEncoder = t.device.createCommandEncoder();
+    const pass = commandEncoder.beginRenderPass({
+      colorAttachments: [
+        {
+          view: renderTargetTexture.createView(),
+          loadOp: 'clear',
+          storeOp: 'store',
+          clearValue: { r: 0, g: 0, b: 0, a: 0 },
+        },
+      ],
+    });
+
+    // Execute Bundles
+    pass.executeBundles([bundleA, bundleB]);
+
+    pass.end();
+    t.device.queue.submit([commandEncoder.finish()]);
+
+    t.expectGPUBufferValuesEqual(outputBuffer, new Uint32Array([1, 2, 3, 4]));
+  });

From 079a9cd5d89915d838819b2206d409d02bb448cb Mon Sep 17 00:00:00 2001
From: shaoboyan <shaoboyan@microsoft.com>
Date: Mon, 9 Mar 2026 16:41:10 +0800
Subject: [PATCH 2/5] Address comments

---
 .../programmable/immediate.spec.ts            | 481 ++++++------------
 1 file changed, 165 insertions(+), 316 deletions(-)

diff --git a/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts b/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts
index a2bda7c7bf9e..3f50558b3df1 100644
--- a/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts
+++ b/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts
@@ -46,7 +46,7 @@ function createPipeline(
   t: AllFeaturesMaxLimitsGPUTest,
   encoderType: ProgrammableEncoderType,
   wgslDecl: string,
-  wgslUsage: string,
+  copyCode: string,
   immediateSize: number
 ) {
   const layout = t.device.createPipelineLayout({
@@ -74,7 +74,13 @@ function createPipeline(
     @group(0) @binding(0) var<storage, read_write> output: array<u32>;
     @group(0) @binding(1) var<uniform> outIndex: u32;
 
-    ${wgslUsage}
+    @compute @workgroup_size(1) fn cs_main() {
+      ${copyCode}
+    }
+    @fragment fn fs_main() -> @location(0) vec4u {
+      ${copyCode}
+      return vec4u(0);
+    }
   `;
 
   if (encoderType === 'compute pass') {
@@ -82,7 +88,6 @@ function createPipeline(
       layout,
       compute: {
         module: t.device.createShaderModule({ code: fullCode }),
-        entryPoint: 'cs_main',
       },
     });
   } else {
@@ -92,20 +97,16 @@ function createPipeline(
         module: t.device.createShaderModule({
           code: `
             @vertex fn vs_main(@builtin(vertex_index) vIdx: u32) -> @builtin(position) vec4f {
-              // Map vIdx 0..3 to pixel centers.
-              // Uses a 4x1 texture.
-              // vIdx 0 -> pixel 0 -> x in [-1, -0.5]
-              // vIdx 1 -> pixel 1 -> x in [-0.5, 0]
+              // Map vIdx 0..3 to pixel centers in a 4x1 render target.
+              // firstVertex selects which pixel to render to.
               let x = (f32(vIdx) + 0.5) / 2.0 - 1.0;
               return vec4f(x, 0.0, 0.0, 1.0);
             }
           `,
         }),
-        entryPoint: 'vs_main',
       },
       fragment: {
         module: t.device.createShaderModule({ code: fullCode }),
-        entryPoint: 'fs_main',
         targets: [{ format: 'r32uint' }],
       },
       primitive: {
@@ -115,7 +116,35 @@ function createPipeline(
   }
 }
 
-function executePass(
+/** Dispatch or draw based on encoder type. firstVertex selects the output pixel for render passes. */
+function dispatchOrDraw(
+  encoderType: ProgrammableEncoderType,
+  encoder: GPUComputePassEncoder | GPURenderPassEncoder | GPURenderBundleEncoder,
+  firstVertex: number = 0
+) {
+  if (encoderType === 'compute pass') {
+    (encoder as GPUComputePassEncoder).dispatchWorkgroups(1);
+  } else {
+    (encoder as GPURenderPassEncoder | GPURenderBundleEncoder).draw(1, 1, firstVertex, 0);
+  }
+}
+
+/** Create a uniform buffer with output indices at 256-byte aligned offsets for dynamic binding. */
+function createOutputIndexBuffer(t: AllFeaturesMaxLimitsGPUTest, count: number): GPUBuffer {
+  const buffer = t.createBufferTracked({
+    size: 256 * count,
+    usage: GPUBufferUsage.UNIFORM,
+    mappedAtCreation: true,
+  });
+  const data = new Uint32Array(buffer.getMappedRange());
+  for (let i = 0; i < count; i++) {
+    data[(i * 256) / 4] = i;
+  }
+  buffer.unmap();
+  return buffer;
+}
+
+function encodeForPassType(
   t: AllFeaturesMaxLimitsGPUTest,
   encoderType: ProgrammableEncoderType,
   commandEncoder: GPUCommandEncoder,
@@ -179,21 +208,11 @@ function runAndCheck(
   expectedValues: number[]
 ) {
   const outputBuffer = t.createBufferTracked({
-    size: 4 * 4, // 4 u32s
+    size: expectedValues.length * 4,
     usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
   });
-  const indexUniformBuffer = t.createBufferTracked({
-    size: 256 * 4, // Enough for dynamic offsets
-    usage: GPUBufferUsage.UNIFORM,
-    mappedAtCreation: true,
-  });
-  // Initialize indices 0, 1, 2, 3 at offsets 0, 256, 512, 768
-  const indexData = new Uint32Array(indexUniformBuffer.getMappedRange());
-  indexData[0] = 0;
-  indexData[256 / 4] = 1;
-  indexData[512 / 4] = 2;
-  indexData[768 / 4] = 3;
-  indexUniformBuffer.unmap();
+  // Simple uniform for outIndex = 0. Dynamic offset is always [0].
+  const indexUniformBuffer = t.makeBufferWithContents(new Uint32Array([0]), GPUBufferUsage.UNIFORM);
 
   const bindGroup = t.device.createBindGroup({
     layout: pipeline.getBindGroupLayout(0),
@@ -204,28 +223,17 @@ function runAndCheck(
   });
 
   const commandEncoder = t.device.createCommandEncoder();
-  executePass(t, encoderType, commandEncoder, encoder => {
+  encodeForPassType(t, encoderType, commandEncoder, encoder => {
     if (encoderType === 'compute pass') {
-      const pass = encoder as GPUComputePassEncoder;
-      pass.setPipeline(pipeline as GPUComputePipeline);
-      pass.setBindGroup(0, bindGroup, [0]); // Index 0
-      setImmediatesFn(pass);
-      pass.dispatchWorkgroups(1);
-    } else if (encoderType === 'render pass') {
-      const pass = encoder as GPURenderPassEncoder;
-      pass.setPipeline(pipeline as GPURenderPipeline);
-      pass.setBindGroup(0, bindGroup, [0]);
-      setImmediatesFn(pass);
-      pass.draw(1, 1, 0, 0); // Vertex 0 -> Pixel 0
-    } else if (encoderType === 'render bundle') {
-      const bundleEncoder = encoder as GPURenderBundleEncoder;
-      bundleEncoder.setPipeline(pipeline as GPURenderPipeline);
-      bundleEncoder.setBindGroup(0, bindGroup, [0]);
-      setImmediatesFn(bundleEncoder);
-      bundleEncoder.draw(1, 1, 0, 0);
+      (encoder as GPUComputePassEncoder).setPipeline(pipeline as GPUComputePipeline);
     } else {
-      unreachable();
+      (encoder as GPURenderPassEncoder | GPURenderBundleEncoder).setPipeline(
+        pipeline as GPURenderPipeline
+      );
     }
+    encoder.setBindGroup(0, bindGroup, [0]);
+    setImmediatesFn(encoder);
+    dispatchOrDraw(encoderType, encoder);
   });
 
   t.device.queue.submit([commandEncoder.finish()]);
@@ -260,7 +268,7 @@ g.test('basic_execution')
     t.skipIfStorageBuffersInFragmentStageNotAvailable(encoderType);
 
     let wgslDecl = '';
-    let wgslUsage = '';
+    let copyCode = '';
     let immediateSize = 0;
     let expected: number[] = [];
     let inputData: Uint32Array;
@@ -271,15 +279,7 @@ g.test('basic_execution')
         struct S { a: u32, b: u32 }
         var<immediate> data: S;
       `;
-      wgslUsage = `
-        @compute @workgroup_size(1) fn cs_main() {
-          output[0] = data.a; output[1] = data.b;
-        }
-        @fragment fn fs_main() -> @location(0) vec4u {
-          output[0] = data.a; output[1] = data.b;
-          return vec4u(0);
-        }
-      `;
+      copyCode = 'output[0] = data.a; output[1] = data.b;';
       inputData = new Uint32Array([0xdeadbeef, 0xcafebabe]);
       expected = [0xdeadbeef, 0xcafebabe];
     } else {
@@ -290,24 +290,13 @@ g.test('basic_execution')
       immediateSize = vSize * 4;
       wgslDecl = `var<immediate> data: ${dataType};`;
 
+      // bitcast<u32> is identity for u32, so we can use it unconditionally.
       let readCode = '';
       for (let i = 0; i < vSize; i++) {
-        let valExpr = vSize === 1 ? 'data' : `data[${i}]`;
-        if (sType === 'i32' || sType === 'f32') {
-          valExpr = `bitcast<u32>(${valExpr})`;
-        }
-        readCode += `output[${i}] = ${valExpr};`;
+        const valExpr = vSize === 1 ? 'data' : `data[${i}]`;
+        readCode += `output[${i}] = bitcast<u32>(${valExpr});\n`;
       }
-
-      wgslUsage = `
-        @compute @workgroup_size(1) fn cs_main() {
-          ${readCode}
-        }
-        @fragment fn fs_main() -> @location(0) vec4u {
-          ${readCode}
-          return vec4u(0);
-        }
-      `;
+      copyCode = readCode;
 
       inputData = new Uint32Array(vSize);
       for (let i = 0; i < vSize; i++) {
@@ -329,14 +318,14 @@ g.test('basic_execution')
       }
     }
 
-    const pipeline = createPipeline(t, encoderType, wgslDecl, wgslUsage, immediateSize);
+    const pipeline = createPipeline(t, encoderType, wgslDecl, copyCode, immediateSize);
 
     runAndCheck(
       t,
       encoderType,
       pipeline,
       encoder => {
-        encoder.setImmediates!(0, inputData.buffer, 0, inputData.buffer.byteLength);
+        encoder.setImmediates!(0, inputData, 0, inputData.length);
       },
       expected
     );
@@ -350,41 +339,22 @@ g.test('update_data')
     t.skipIfStorageBuffersInFragmentStageNotAvailable(encoderType);
     const immediateSize = 16;
     const wgslDecl = 'var<immediate> data: vec4<u32>;';
-    const wgslUsage = `
-      @compute @workgroup_size(1) fn cs_main() {
-        let base = outIndex * 4;
-        output[base + 0] = data[0];
-        output[base + 1] = data[1];
-        output[base + 2] = data[2];
-        output[base + 3] = data[3];
-      }
-      @fragment fn fs_main() -> @location(0) vec4u {
-        let base = outIndex * 4;
-        output[base + 0] = data[0];
-        output[base + 1] = data[1];
-        output[base + 2] = data[2];
-        output[base + 3] = data[3];
-        return vec4u(0);
-      }
+    const copyCode = `
+      let base = outIndex * 4;
+      output[base + 0] = data[0];
+      output[base + 1] = data[1];
+      output[base + 2] = data[2];
+      output[base + 3] = data[3];
     `;
 
-    const pipeline = createPipeline(t, encoderType, wgslDecl, wgslUsage, immediateSize);
+    const pipeline = createPipeline(t, encoderType, wgslDecl, copyCode, immediateSize);
 
     const outputBuffer = t.createBufferTracked({
       size: 4 * 4 * 3, // 3 steps, 4 u32s each
       usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
     });
 
-    const indexUniformBuffer = t.createBufferTracked({
-      size: 256 * 3,
-      usage: GPUBufferUsage.UNIFORM,
-      mappedAtCreation: true,
-    });
-    const indexData = new Uint32Array(indexUniformBuffer.getMappedRange());
-    indexData[0] = 0;
-    indexData[256 / 4] = 1;
-    indexData[512 / 4] = 2;
-    indexUniformBuffer.unmap();
+    const indexUniformBuffer = createOutputIndexBuffer(t, 3);
 
     const bindGroup = t.device.createBindGroup({
       layout: pipeline.getBindGroupLayout(0),
@@ -396,6 +366,7 @@ g.test('update_data')
 
     const commandEncoder = t.device.createCommandEncoder();
 
+    /** Set bind group with dynamic offset for output index, set immediates, and dispatch/draw. */
     const runStep = (
       pass: GPURenderPassEncoder | GPUComputePassEncoder | GPURenderBundleEncoder,
       stepIndex: number,
@@ -404,23 +375,16 @@ g.test('update_data')
     ) => {
       pass.setBindGroup(0, bindGroup, [stepIndex * 256]);
       pass.setImmediates!(dstOffset, data, 0, data.length);
-
-      if (encoderType === 'compute pass') {
-        (pass as GPUComputePassEncoder).dispatchWorkgroups(1);
-      } else if (encoderType === 'render pass') {
-        (pass as GPURenderPassEncoder).draw(1, 1, 0, 0);
-      } else {
-        (pass as GPURenderBundleEncoder).draw(1, 1, 0, 0);
-      }
+      dispatchOrDraw(encoderType, pass);
     };
 
-    executePass(t, encoderType, commandEncoder, enc => {
+    encodeForPassType(t, encoderType, commandEncoder, enc => {
       if (encoderType === 'compute pass') {
         (enc as GPUComputePassEncoder).setPipeline(pipeline as GPUComputePipeline);
-      } else if (encoderType === 'render pass') {
-        (enc as GPURenderPassEncoder).setPipeline(pipeline as GPURenderPipeline);
       } else {
-        (enc as GPURenderBundleEncoder).setPipeline(pipeline as GPURenderPipeline);
+        (enc as GPURenderPassEncoder | GPURenderBundleEncoder).setPipeline(
+          pipeline as GPURenderPipeline
+        );
       }
 
       // Step 1: Full set [1, 2, 3, 4]
@@ -442,166 +406,100 @@ g.test('update_data')
 g.test('pipeline_switch')
   .desc(
     `Verify immediate data is correctly set after switching pipelines.
-    - compatible: Both pipelines use the same immediateSize (same layout).
-    - incompatible: Pipelines use different immediateSize values (different layouts).
-    In both cases, immediates must be reset correctly between draws/dispatches.`
+    - sameImmediateSize=true: Both pipelines use the same immediateSize.
+    - sameImmediateSize=false: Pipelines use different immediateSize values.
+    In both cases, immediates must be set correctly between draws/dispatches.`
   )
   .params(u =>
     u
       .combine('encoderType', ['render pass', 'compute pass'] as const)
-      .combine('compatibility', ['compatible', 'incompatible'] as const)
+      .combine('sameImmediateSize', [true, false] as const)
   )
   .fn(t => {
-    const { encoderType, compatibility } = t.params;
+    const { encoderType, sameImmediateSize } = t.params;
     t.skipIfStorageBuffersInFragmentStageNotAvailable(encoderType);
 
     // Pipeline A always uses vec4<u32> (16 bytes).
     const wgslDeclA = 'var<immediate> data: vec4<u32>;';
-    const wgslUsageA = `
-      @compute @workgroup_size(1) fn cs_main() {
-        let base = outIndex * 4;
-        output[base] = data.x; output[base+1] = data.y; output[base+2] = data.z; output[base+3] = data.w;
-      }
-      @fragment fn fs_main() -> @location(0) vec4u {
-        let base = outIndex * 4;
-        output[base] = data.x; output[base+1] = data.y; output[base+2] = data.z; output[base+3] = data.w;
-        return vec4u(0);
-      }
-      `;
+    const copyCodeA = `
+      let base = outIndex * 4;
+      output[base] = data.x; output[base+1] = data.y; output[base+2] = data.z; output[base+3] = data.w;
+    `;
 
     let wgslDeclB: string;
-    let wgslUsageB: string;
+    let copyCodeB: string;
     let immediateSizeB: number;
 
-    if (compatibility === 'compatible') {
+    if (sameImmediateSize) {
       // Pipeline B has the same immediate layout as A (vec4<u32>, 16 bytes).
       wgslDeclB = wgslDeclA;
-      wgslUsageB = wgslUsageA;
+      copyCodeB = copyCodeA;
       immediateSizeB = 16;
     } else {
       // Pipeline B uses vec2<u32> (8 bytes) — different/incompatible layout.
       wgslDeclB = 'var<immediate> data: vec2<u32>;';
-      wgslUsageB = `
-        @compute @workgroup_size(1) fn cs_main() {
-          let base = outIndex * 4;
-          output[base] = data.x; output[base+1] = data.y; output[base+2] = 0u; output[base+3] = 0u;
-        }
-        @fragment fn fs_main() -> @location(0) vec4u {
-          let base = outIndex * 4;
-          output[base] = data.x; output[base+1] = data.y; output[base+2] = 0u; output[base+3] = 0u;
-          return vec4u(0);
-        }
-        `;
+      copyCodeB = `
+        let base = outIndex * 4;
+        output[base] = data.x; output[base+1] = data.y; output[base+2] = 0u; output[base+3] = 0u;
+      `;
       immediateSizeB = 8;
     }
 
-    const pipelineA = createPipeline(t, encoderType, wgslDeclA, wgslUsageA, 16);
-    const pipelineB = createPipeline(t, encoderType, wgslDeclB, wgslUsageB, immediateSizeB);
+    const pipelineA = createPipeline(t, encoderType, wgslDeclA, copyCodeA, 16);
+    const pipelineB = createPipeline(t, encoderType, wgslDeclB, copyCodeB, immediateSizeB);
 
     const outputBuffer = t.createBufferTracked({
       size: 32,
       usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
     });
-    const indexUniformBuffer = t.createBufferTracked({
-      size: 512,
-      usage: GPUBufferUsage.UNIFORM,
-      mappedAtCreation: true,
-    });
-    const indexData = new Uint32Array(indexUniformBuffer.getMappedRange());
-    indexData[0] = 0;
-    indexData[256 / 4] = 1;
-    indexUniformBuffer.unmap();
+    const indexUniformBuffer = createOutputIndexBuffer(t, 2);
 
-    const bindGroupA = t.device.createBindGroup({
+    // Use a single bind group. Both pipelines have the same bind group layout,
+    // so the same bind group is compatible with both. This verifies that
+    // previously bound descriptor sets remain valid after a pipeline switch
+    // (important for Vulkan correctness).
+    const bindGroup = t.device.createBindGroup({
       layout: pipelineA.getBindGroupLayout(0),
       entries: [
         { binding: 0, resource: { buffer: outputBuffer } },
         { binding: 1, resource: { buffer: indexUniformBuffer, size: 4 } },
       ],
     });
-    const bindGroupB = t.device.createBindGroup({
-      layout: pipelineB.getBindGroupLayout(0),
-      entries: [
-        { binding: 0, resource: { buffer: outputBuffer } },
-        { binding: 1, resource: { buffer: indexUniformBuffer, size: 4 } },
-      ],
-    });
 
-    const commandEncoder = t.device.createCommandEncoder();
+    const immA = new Uint32Array([1, 2, 3, 4]);
+    const immB = sameImmediateSize ? new Uint32Array([1, 2, 3, 4]) : new Uint32Array([1, 2]);
 
-    if (compatibility === 'compatible') {
-      executePass(t, encoderType, commandEncoder, enc => {
-        if (encoderType === 'compute pass') {
-          const pass = enc as GPUComputePassEncoder;
-          // 1. Set Pipeline A, set immediates [1, 2, 3, 4], dispatch
-          pass.setPipeline(pipelineA as GPUComputePipeline);
-          pass.setBindGroup(0, bindGroupA, [0]);
-          pass.setImmediates!(0, new Uint32Array([1, 2, 3, 4]), 0, 4);
-          pass.dispatchWorkgroups(1);
-
-          // 2. Switch to Pipeline B (compatible), set new immediates [5, 6, 7, 8], dispatch
-          pass.setPipeline(pipelineB as GPUComputePipeline);
-          pass.setBindGroup(0, bindGroupB, [256]);
-          pass.setImmediates!(0, new Uint32Array([5, 6, 7, 8]), 0, 4);
-          pass.dispatchWorkgroups(1);
-        } else {
-          const pass = enc as GPURenderPassEncoder;
-          // 1. Set Pipeline A, set immediates [1, 2, 3, 4], draw
-          pass.setPipeline(pipelineA as GPURenderPipeline);
-          pass.setBindGroup(0, bindGroupA, [0]);
-          pass.setImmediates!(0, new Uint32Array([1, 2, 3, 4]), 0, 4);
-          pass.draw(1, 1, 0, 0);
-
-          // 2. Switch to Pipeline B (compatible), set new immediates [5, 6, 7, 8], draw
-          pass.setPipeline(pipelineB as GPURenderPipeline);
-          pass.setBindGroup(0, bindGroupB, [256]);
-          pass.setImmediates!(0, new Uint32Array([5, 6, 7, 8]), 0, 4);
-          pass.draw(1, 1, 1, 0);
-        }
-      });
-
-      t.device.queue.submit([commandEncoder.finish()]);
+    const commandEncoder = t.device.createCommandEncoder();
+    encodeForPassType(t, encoderType, commandEncoder, enc => {
+      // Pipeline A: set immediates [1, 2, 3, 4], dispatch/draw at outIndex 0.
+      if (encoderType === 'compute pass') {
+        (enc as GPUComputePassEncoder).setPipeline(pipelineA as GPUComputePipeline);
+      } else {
+        (enc as GPURenderPassEncoder).setPipeline(pipelineA as GPURenderPipeline);
+      }
+      enc.setBindGroup(0, bindGroup, [0]);
+      enc.setImmediates!(0, immA, 0, immA.length);
+      dispatchOrDraw(encoderType, enc, 0);
 
-      const expected = new Uint32Array([1, 2, 3, 4, 5, 6, 7, 8]);
-      t.expectGPUBufferValuesEqual(outputBuffer, expected);
-    } else {
-      // Incompatible: Pipeline A uses 16 bytes, Pipeline B uses 8 bytes.
-      executePass(t, encoderType, commandEncoder, enc => {
-        if (encoderType === 'compute pass') {
-          const pass = enc as GPUComputePassEncoder;
-          // 1. Set Pipeline A (16 bytes), set immediates [1, 2, 3, 4], dispatch
-          pass.setPipeline(pipelineA as GPUComputePipeline);
-          pass.setBindGroup(0, bindGroupA, [0]);
-          pass.setImmediates!(0, new Uint32Array([1, 2, 3, 4]), 0, 4);
-          pass.dispatchWorkgroups(1);
-
-          // 2. Switch to Pipeline B (8 bytes), set new immediates [5, 6], dispatch
-          pass.setPipeline(pipelineB as GPUComputePipeline);
-          pass.setBindGroup(0, bindGroupB, [256]);
-          pass.setImmediates!(0, new Uint32Array([5, 6]), 0, 2);
-          pass.dispatchWorkgroups(1);
-        } else {
-          const pass = enc as GPURenderPassEncoder;
-          // 1. Set Pipeline A (16 bytes), set immediates [1, 2, 3, 4], draw
-          pass.setPipeline(pipelineA as GPURenderPipeline);
-          pass.setBindGroup(0, bindGroupA, [0]);
-          pass.setImmediates!(0, new Uint32Array([1, 2, 3, 4]), 0, 4);
-          pass.draw(1, 1, 0, 0);
-
-          // 2. Switch to Pipeline B (8 bytes), set new immediates [5, 6], draw
-          pass.setPipeline(pipelineB as GPURenderPipeline);
-          pass.setBindGroup(0, bindGroupB, [256]);
-          pass.setImmediates!(0, new Uint32Array([5, 6]), 0, 2);
-          pass.draw(1, 1, 1, 0);
-        }
-      });
+      // Pipeline B: switch pipeline, update dynamic offset for outIndex 1.
+      if (encoderType === 'compute pass') {
+        (enc as GPUComputePassEncoder).setPipeline(pipelineB as GPUComputePipeline);
+      } else {
+        (enc as GPURenderPassEncoder).setPipeline(pipelineB as GPURenderPipeline);
+      }
+      enc.setBindGroup(0, bindGroup, [256]);
+      enc.setImmediates!(0, immB, 0, immB.length);
+      dispatchOrDraw(encoderType, enc, 1);
+    });
 
-      t.device.queue.submit([commandEncoder.finish()]);
+    t.device.queue.submit([commandEncoder.finish()]);
 
-      // Pipeline A wrote [1, 2, 3, 4]; Pipeline B wrote [5, 6, 0, 0] (only 2 fields, rest zeroed).
-      const expected = new Uint32Array([1, 2, 3, 4, 5, 6, 0, 0]);
-      t.expectGPUBufferValuesEqual(outputBuffer, expected);
-    }
+    // Pipeline A wrote [1, 2, 3, 4] at outIndex 0.
+    // Pipeline B wrote [1, 2, 3, 4] (same size) or [1, 2, 0, 0] (different size) at outIndex 1.
+    const expected = sameImmediateSize
+      ? new Uint32Array([1, 2, 3, 4, 1, 2, 3, 4])
+      : new Uint32Array([1, 2, 3, 4, 1, 2, 0, 0]);
+    t.expectGPUBufferValuesEqual(outputBuffer, expected);
   });
 
 g.test('use_max_immediate_size')
@@ -619,19 +517,12 @@ g.test('use_max_immediate_size')
       members.push(`m${i}: u32`);
     }
     const wgslDecl = `struct Large { ${members.join(', ')} } var<immediate> data: Large;`;
-    const wgslUsage = `
-      @compute @workgroup_size(1) fn cs_main() {
-        output[0] = data.m0;
-        output[1] = data.m${count - 1};
-      }
-      @fragment fn fs_main() -> @location(0) vec4u {
-        output[0] = data.m0;
-        output[1] = data.m${count - 1};
-        return vec4u(0);
-      }
+    const copyCode = `
+      output[0] = data.m0;
+      output[1] = data.m${count - 1};
     `;
 
-    const pipeline = createPipeline(t, encoderType, wgslDecl, wgslUsage, maxImmediateSize);
+    const pipeline = createPipeline(t, encoderType, wgslDecl, copyCode, maxImmediateSize);
 
     const outputBuffer = t.createBufferTracked({
       size: 8,
@@ -651,24 +542,21 @@ g.test('use_max_immediate_size')
     });
 
     const commandEncoder = t.device.createCommandEncoder();
-    executePass(t, encoderType, commandEncoder, enc => {
+    encodeForPassType(t, encoderType, commandEncoder, enc => {
       const data = new Uint32Array(count);
       data[0] = 0xdeadbeef;
       data[count - 1] = 0xcafebabe;
 
       if (encoderType === 'compute pass') {
-        const pass = enc as GPUComputePassEncoder;
-        pass.setPipeline(pipeline as GPUComputePipeline);
-        pass.setBindGroup(0, bindGroup, [0]);
-        pass.setImmediates!(0, data, 0, count);
-        pass.dispatchWorkgroups(1);
+        (enc as GPUComputePassEncoder).setPipeline(pipeline as GPUComputePipeline);
       } else {
-        const pass = enc as GPURenderPassEncoder | GPURenderBundleEncoder;
-        pass.setPipeline(pipeline as GPURenderPipeline);
-        pass.setBindGroup(0, bindGroup, [0]);
-        pass.setImmediates!(0, data, 0, count);
-        pass.draw(1, 1, 0, 0);
+        (enc as GPURenderPassEncoder | GPURenderBundleEncoder).setPipeline(
+          pipeline as GPURenderPipeline
+        );
       }
+      enc.setBindGroup(0, bindGroup, [0]);
+      enc.setImmediates!(0, data, 0, count);
+      dispatchOrDraw(encoderType, enc);
     });
 
     t.device.queue.submit([commandEncoder.finish()]);
@@ -723,19 +611,12 @@ g.test('typed_array_arguments')
 
     // Shader: read 2 u32s (8 bytes).
     const wgslDecl = 'var<immediate> data: array<u32, 2>;';
-    const wgslUsage = `
-      @compute @workgroup_size(1) fn cs_main() {
-        output[0] = data[0];
-        output[1] = data[1];
-      }
-      @fragment fn fs_main() -> @location(0) vec4u {
-        output[0] = data[0];
-        output[1] = data[1];
-        return vec4u(0);
-      }
+    const copyCode = `
+      output[0] = data[0];
+      output[1] = data[1];
     `;
 
-    const pipeline = createPipeline(t, encoderType, wgslDecl, wgslUsage, 8);
+    const pipeline = createPipeline(t, encoderType, wgslDecl, copyCode, 8);
 
     const outputBuffer = t.createBufferTracked({
       size: 8,
@@ -755,20 +636,17 @@ g.test('typed_array_arguments')
     });
 
     const commandEncoder = t.device.createCommandEncoder();
-    executePass(t, encoderType, commandEncoder, enc => {
+    encodeForPassType(t, encoderType, commandEncoder, enc => {
       if (encoderType === 'compute pass') {
-        const pass = enc as GPUComputePassEncoder;
-        pass.setPipeline(pipeline as GPUComputePipeline);
-        pass.setBindGroup(0, bindGroup, [0]);
-        pass.setImmediates!(0, arr, offset, elementCount);
-        pass.dispatchWorkgroups(1);
+        (enc as GPUComputePassEncoder).setPipeline(pipeline as GPUComputePipeline);
       } else {
-        const pass = enc as GPURenderPassEncoder | GPURenderBundleEncoder;
-        pass.setPipeline(pipeline as GPURenderPipeline);
-        pass.setBindGroup(0, bindGroup, [0]);
-        pass.setImmediates!(0, arr, offset, elementCount);
-        pass.draw(1, 1, 0, 0);
+        (enc as GPURenderPassEncoder | GPURenderBundleEncoder).setPipeline(
+          pipeline as GPURenderPipeline
+        );
       }
+      enc.setBindGroup(0, bindGroup, [0]);
+      enc.setImmediates!(0, arr, offset, elementCount);
+      dispatchOrDraw(encoderType, enc);
     });
 
     t.device.queue.submit([commandEncoder.finish()]);
@@ -788,16 +666,9 @@ g.test('multiple_updates_before_draw_or_dispatch')
     t.skipIfStorageBuffersInFragmentStageNotAvailable(encoderType);
     // Use vec4<u32> to allow partial updates.
     const wgslDecl = 'var<immediate> data: vec4<u32>;';
-    const wgslUsage = `
-      @compute @workgroup_size(1) fn cs_main() {
-        output[0] = data.x; output[1] = data.y; output[2] = data.z; output[3] = data.w;
-      }
-      @fragment fn fs_main() -> @location(0) vec4u {
-        output[0] = data.x; output[1] = data.y; output[2] = data.z; output[3] = data.w;
-        return vec4u(0);
-      }
-    `;
-    const pipeline = createPipeline(t, encoderType, wgslDecl, wgslUsage, 16);
+    const copyCode =
+      'output[0] = data.x; output[1] = data.y; output[2] = data.z; output[3] = data.w;';
+    const pipeline = createPipeline(t, encoderType, wgslDecl, copyCode, 16);
 
     runAndCheck(
       t,
@@ -820,31 +691,20 @@ g.test('render_pass_and_bundle_mix')
   .fn(t => {
     t.skipIfStorageBuffersInFragmentStageNotAvailable('render pass');
     const wgslDecl = 'var<immediate> data: vec2<u32>;';
-    const wgslUsage = `
-      @fragment fn fs_main() -> @location(0) vec4u {
-        let base = outIndex * 2;
-        output[base] = data.x;
-        output[base + 1] = data.y;
-        return vec4u(0);
-      }
+    const copyCode = `
+      let base = outIndex * 2;
+      output[base] = data.x;
+      output[base + 1] = data.y;
     `;
     // Use 'render pass' type to create the pipeline, but it works for bundle too.
     // Immediate size: vec2<u32> = 2 * 4 bytes = 8 bytes.
-    const pipeline = createPipeline(t, 'render pass', wgslDecl, wgslUsage, 8) as GPURenderPipeline;
+    const pipeline = createPipeline(t, 'render pass', wgslDecl, copyCode, 8) as GPURenderPipeline;
 
     const outputBuffer = t.createBufferTracked({
       size: 16,
       usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
     });
-    const indexUniformBuffer = t.createBufferTracked({
-      size: 512,
-      usage: GPUBufferUsage.UNIFORM,
-      mappedAtCreation: true,
-    });
-    const idxData = new Uint32Array(indexUniformBuffer.getMappedRange());
-    idxData[0] = 0;
-    idxData[256 / 4] = 1;
-    indexUniformBuffer.unmap();
+    const indexUniformBuffer = createOutputIndexBuffer(t, 2);
 
     const bindGroup = t.device.createBindGroup({
       layout: pipeline.getBindGroupLayout(0),
@@ -899,29 +759,18 @@ g.test('render_bundle_isolation')
   .fn(t => {
     t.skipIfStorageBuffersInFragmentStageNotAvailable('render bundle');
     const wgslDecl = 'var<immediate> data: vec2<u32>;';
-    const wgslUsage = `
-      @fragment fn fs_main() -> @location(0) vec4u {
-        let base = outIndex * 2;
-        output[base] = data.x;
-        output[base + 1] = data.y;
-        return vec4u(0);
-      }
+    const copyCode = `
+      let base = outIndex * 2;
+      output[base] = data.x;
+      output[base + 1] = data.y;
     `;
-    const pipeline = createPipeline(t, 'render pass', wgslDecl, wgslUsage, 8) as GPURenderPipeline;
+    const pipeline = createPipeline(t, 'render pass', wgslDecl, copyCode, 8) as GPURenderPipeline;
 
     const outputBuffer = t.createBufferTracked({
       size: 16, // 2 draws * 2 u32s * 4 bytes
       usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
     });
-    const indexUniformBuffer = t.createBufferTracked({
-      size: 512,
-      usage: GPUBufferUsage.UNIFORM,
-      mappedAtCreation: true,
-    });
-    const idxData = new Uint32Array(indexUniformBuffer.getMappedRange());
-    idxData[0] = 0;
-    idxData[256 / 4] = 1;
-    indexUniformBuffer.unmap();
+    const indexUniformBuffer = createOutputIndexBuffer(t, 2);
 
     const bindGroup = t.device.createBindGroup({
       layout: pipeline.getBindGroupLayout(0),

From f57ee4ecc4526926fe1938ce271a308ea5a7c696 Mon Sep 17 00:00:00 2001
From: shaoboyan <shaoboyan@microsoft.com>
Date: Tue, 10 Mar 2026 16:46:46 +0800
Subject: [PATCH 3/5] Iterate

---
 .../programmable/immediate.spec.ts            | 325 ++++++++++--------
 1 file changed, 191 insertions(+), 134 deletions(-)

diff --git a/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts b/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts
index 3f50558b3df1..a7f3a63a8ee4 100644
--- a/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts
+++ b/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts
@@ -5,7 +5,6 @@ Operation tests for immediate data usage in RenderPassEncoder, ComputePassEncode
 import { makeTestGroup } from '../../../../../common/framework/test_group.js';
 import { getGPU } from '../../../../../common/util/navigator_gpu.js';
 import {
-  assert,
   kTypedArrayBufferViews,
   kTypedArrayBufferViewKeys,
   supportsImmediateData,
@@ -47,27 +46,30 @@ function createPipeline(
   encoderType: ProgrammableEncoderType,
   wgslDecl: string,
   copyCode: string,
-  immediateSize: number
+  immediateSize: number,
+  pipelineLayout?: GPUPipelineLayout
 ) {
-  const layout = t.device.createPipelineLayout({
-    bindGroupLayouts: [
-      t.device.createBindGroupLayout({
-        entries: [
-          {
-            binding: 0,
-            visibility: GPUShaderStage.COMPUTE | GPUShaderStage.FRAGMENT,
-            buffer: { type: 'storage' },
-          },
-          {
-            binding: 1,
-            visibility: GPUShaderStage.COMPUTE | GPUShaderStage.FRAGMENT,
-            buffer: { type: 'uniform', hasDynamicOffset: true },
-          },
-        ],
-      }),
-    ],
-    immediateSize,
-  });
+  const layout =
+    pipelineLayout ||
+    t.device.createPipelineLayout({
+      bindGroupLayouts: [
+        t.device.createBindGroupLayout({
+          entries: [
+            {
+              binding: 0,
+              visibility: GPUShaderStage.COMPUTE | GPUShaderStage.FRAGMENT,
+              buffer: { type: 'storage' },
+            },
+            {
+              binding: 1,
+              visibility: GPUShaderStage.COMPUTE | GPUShaderStage.FRAGMENT | GPUShaderStage.VERTEX,
+              buffer: { type: 'uniform', hasDynamicOffset: true },
+            },
+          ],
+        }),
+      ],
+      immediateSize,
+    });
 
   const fullCode = `
     ${wgslDecl}
@@ -96,10 +98,13 @@ function createPipeline(
       vertex: {
         module: t.device.createShaderModule({
           code: `
-            @vertex fn vs_main(@builtin(vertex_index) vIdx: u32) -> @builtin(position) vec4f {
-              // Map vIdx 0..3 to pixel centers in a 4x1 render target.
-              // firstVertex selects which pixel to render to.
-              let x = (f32(vIdx) + 0.5) / 2.0 - 1.0;
+            // Re-declare outIndex in the vertex shader
+            @group(0) @binding(1) var<uniform> outIndex: u32;
+
+            @vertex fn vs_main() -> @builtin(position) vec4f {
+              // Map outIndex 0..3 to pixel centers in a 4x1 render target.
+              // x = (f32(outIndex) + 0.5) / 2.0 - 1.0
+              let x = (f32(outIndex) + 0.5) / 2.0 - 1.0;
               return vec4f(x, 0.0, 0.0, 1.0);
             }
           `,
@@ -116,20 +121,23 @@ function createPipeline(
   }
 }
 
-/** Dispatch or draw based on encoder type. firstVertex selects the output pixel for render passes. */
+/** Dispatch or draw based on encoder type. */
 function dispatchOrDraw(
   encoderType: ProgrammableEncoderType,
-  encoder: GPUComputePassEncoder | GPURenderPassEncoder | GPURenderBundleEncoder,
-  firstVertex: number = 0
+  encoder: GPUComputePassEncoder | GPURenderPassEncoder | GPURenderBundleEncoder
 ) {
   if (encoderType === 'compute pass') {
     (encoder as GPUComputePassEncoder).dispatchWorkgroups(1);
   } else {
-    (encoder as GPURenderPassEncoder | GPURenderBundleEncoder).draw(1, 1, firstVertex, 0);
+    (encoder as GPURenderPassEncoder | GPURenderBundleEncoder).draw(1); // 1 Vertex over 1 Instance
   }
 }
 
-/** Create a uniform buffer with output indices at 256-byte aligned offsets for dynamic binding. */
+/**
+ * Create a uniform buffer with output indices at 256-byte aligned offsets for dynamic binding.
+ * This is used instead of other mechanisms to provide an output index (like firstVertex or immediates)
+ * because it works across all shader stages without consuming the immediate data capability that we are actively testing.
+ */
 function createOutputIndexBuffer(t: AllFeaturesMaxLimitsGPUTest, count: number): GPUBuffer {
   const buffer = t.createBufferTracked({
     size: 256 * count,
@@ -198,6 +206,20 @@ function encodeForPassType(
   }
 }
 
+function setPipeline(
+  encoderType: ProgrammableEncoderType,
+  encoder: GPUComputePassEncoder | GPURenderPassEncoder | GPURenderBundleEncoder,
+  pipeline: GPURenderPipeline | GPUComputePipeline
+) {
+  if (encoderType === 'compute pass') {
+    (encoder as GPUComputePassEncoder).setPipeline(pipeline as GPUComputePipeline);
+  } else {
+    (encoder as GPURenderPassEncoder | GPURenderBundleEncoder).setPipeline(
+      pipeline as GPURenderPipeline
+    );
+  }
+}
+
 function runAndCheck(
   t: AllFeaturesMaxLimitsGPUTest,
   encoderType: ProgrammableEncoderType,
@@ -224,13 +246,7 @@ function runAndCheck(
 
   const commandEncoder = t.device.createCommandEncoder();
   encodeForPassType(t, encoderType, commandEncoder, encoder => {
-    if (encoderType === 'compute pass') {
-      (encoder as GPUComputePassEncoder).setPipeline(pipeline as GPUComputePipeline);
-    } else {
-      (encoder as GPURenderPassEncoder | GPURenderBundleEncoder).setPipeline(
-        pipeline as GPURenderPipeline
-      );
-    }
+    setPipeline(encoderType, encoder, pipeline);
     encoder.setBindGroup(0, bindGroup, [0]);
     setImmediatesFn(encoder);
     dispatchOrDraw(encoderType, encoder);
@@ -291,12 +307,10 @@ g.test('basic_execution')
       wgslDecl = `var<immediate> data: ${dataType};`;
 
       // bitcast<u32> is identity for u32, so we can use it unconditionally.
-      let readCode = '';
       for (let i = 0; i < vSize; i++) {
         const valExpr = vSize === 1 ? 'data' : `data[${i}]`;
-        readCode += `output[${i}] = bitcast<u32>(${valExpr});\n`;
+        copyCode += `output[${i}] = bitcast<u32>(${valExpr});\n`;
       }
-      copyCode = readCode;
 
       inputData = new Uint32Array(vSize);
       for (let i = 0; i < vSize; i++) {
@@ -379,13 +393,7 @@ g.test('update_data')
     };
 
     encodeForPassType(t, encoderType, commandEncoder, enc => {
-      if (encoderType === 'compute pass') {
-        (enc as GPUComputePassEncoder).setPipeline(pipeline as GPUComputePipeline);
-      } else {
-        (enc as GPURenderPassEncoder | GPURenderBundleEncoder).setPipeline(
-          pipeline as GPURenderPipeline
-        );
-      }
+      setPipeline(encoderType, enc, pipeline);
 
       // Step 1: Full set [1, 2, 3, 4]
       runStep(enc, 0, new Uint32Array([1, 2, 3, 4]));
@@ -445,8 +453,32 @@ g.test('pipeline_switch')
       immediateSizeB = 8;
     }
 
-    const pipelineA = createPipeline(t, encoderType, wgslDeclA, copyCodeA, 16);
-    const pipelineB = createPipeline(t, encoderType, wgslDeclB, copyCodeB, immediateSizeB);
+    const bindGroupLayout = t.device.createBindGroupLayout({
+      entries: [
+        {
+          binding: 0,
+          visibility: GPUShaderStage.COMPUTE | GPUShaderStage.FRAGMENT,
+          buffer: { type: 'storage' },
+        },
+        {
+          binding: 1,
+          visibility: GPUShaderStage.COMPUTE | GPUShaderStage.FRAGMENT | GPUShaderStage.VERTEX,
+          buffer: { type: 'uniform', hasDynamicOffset: true },
+        },
+      ],
+    });
+
+    const layoutA = t.device.createPipelineLayout({
+      bindGroupLayouts: [bindGroupLayout],
+      immediateSize: 16,
+    });
+    const pipelineA = createPipeline(t, encoderType, wgslDeclA, copyCodeA, 16, layoutA);
+
+    const layoutB = t.device.createPipelineLayout({
+      bindGroupLayouts: [bindGroupLayout],
+      immediateSize: immediateSizeB,
+    });
+    const pipelineB = createPipeline(t, encoderType, wgslDeclB, copyCodeB, immediateSizeB, layoutB);
 
     const outputBuffer = t.createBufferTracked({
       size: 32,
@@ -459,37 +491,29 @@ g.test('pipeline_switch')
     // previously bound descriptor sets remain valid after a pipeline switch
     // (important for Vulkan correctness).
     const bindGroup = t.device.createBindGroup({
-      layout: pipelineA.getBindGroupLayout(0),
+      layout: bindGroupLayout,
       entries: [
         { binding: 0, resource: { buffer: outputBuffer } },
         { binding: 1, resource: { buffer: indexUniformBuffer, size: 4 } },
       ],
     });
 
-    const immA = new Uint32Array([1, 2, 3, 4]);
-    const immB = sameImmediateSize ? new Uint32Array([1, 2, 3, 4]) : new Uint32Array([1, 2]);
+    const immData = new Uint32Array([1, 2, 3, 4]);
 
     const commandEncoder = t.device.createCommandEncoder();
     encodeForPassType(t, encoderType, commandEncoder, enc => {
       // Pipeline A: set immediates [1, 2, 3, 4], dispatch/draw at outIndex 0.
-      if (encoderType === 'compute pass') {
-        (enc as GPUComputePassEncoder).setPipeline(pipelineA as GPUComputePipeline);
-      } else {
-        (enc as GPURenderPassEncoder).setPipeline(pipelineA as GPURenderPipeline);
-      }
+      setPipeline(encoderType, enc, pipelineA);
       enc.setBindGroup(0, bindGroup, [0]);
-      enc.setImmediates!(0, immA, 0, immA.length);
-      dispatchOrDraw(encoderType, enc, 0);
+      enc.setImmediates!(0, immData, 0, immData.length);
+      dispatchOrDraw(encoderType, enc);
 
       // Pipeline B: switch pipeline, update dynamic offset for outIndex 1.
-      if (encoderType === 'compute pass') {
-        (enc as GPUComputePassEncoder).setPipeline(pipelineB as GPUComputePipeline);
-      } else {
-        (enc as GPURenderPassEncoder).setPipeline(pipelineB as GPURenderPipeline);
-      }
+      setPipeline(encoderType, enc, pipelineB);
       enc.setBindGroup(0, bindGroup, [256]);
-      enc.setImmediates!(0, immB, 0, immB.length);
-      dispatchOrDraw(encoderType, enc, 1);
+      // We purposefully don't call setImmediates here to verify that the
+      // immediates set prior to the pipeline switch are preserved.
+      dispatchOrDraw(encoderType, enc);
     });
 
     t.device.queue.submit([commandEncoder.finish()]);
@@ -508,8 +532,13 @@ g.test('use_max_immediate_size')
   .fn(t => {
     const { encoderType } = t.params;
     t.skipIfStorageBuffersInFragmentStageNotAvailable(encoderType);
-    assert(t.device.limits.maxImmediateSize !== undefined);
+
     const maxImmediateSize = t.device.limits.maxImmediateSize;
+    if (maxImmediateSize === undefined) {
+      t.skip('maxImmediateSize limit is undefined');
+      return;
+    }
+
     // Create a pipeline that reads the first and last u32 of the immediate data
     const count = maxImmediateSize / 4;
     const members: string[] = [];
@@ -528,10 +557,7 @@ g.test('use_max_immediate_size')
       size: 8,
       usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
     });
-    const indexUniformBuffer = t.makeBufferWithContents(
-      new Uint32Array(256 / 4),
-      GPUBufferUsage.UNIFORM
-    );
+    const indexUniformBuffer = createOutputIndexBuffer(t, 1);
 
     const bindGroup = t.device.createBindGroup({
       layout: pipeline.getBindGroupLayout(0),
@@ -547,13 +573,7 @@ g.test('use_max_immediate_size')
       data[0] = 0xdeadbeef;
       data[count - 1] = 0xcafebabe;
 
-      if (encoderType === 'compute pass') {
-        (enc as GPUComputePassEncoder).setPipeline(pipeline as GPUComputePipeline);
-      } else {
-        (enc as GPURenderPassEncoder | GPURenderBundleEncoder).setPipeline(
-          pipeline as GPURenderPipeline
-        );
-      }
+      setPipeline(encoderType, enc, pipeline);
       enc.setBindGroup(0, bindGroup, [0]);
       enc.setImmediates!(0, data, 0, count);
       dispatchOrDraw(encoderType, enc);
@@ -564,69 +584,91 @@ g.test('use_max_immediate_size')
   });
 
 g.test('typed_array_arguments')
-  .desc('Verify srcOffset and srcSize are in elements for TypedArrays.')
+  .desc('Verify dataOffset and dataSize arguments work correctly for all TypedArray types.')
   .params(u =>
     u
       .combine('typedArray', kTypedArrayBufferViewKeys)
       .combine('encoderType', kProgrammableEncoderTypes)
+      .beginSubcases()
+      .combine('dataOffset', [undefined, 0, 2])
+      .combine('dataSize', [undefined, 2])
+      .filter(t => {
+        // WebGPU requires the byte size of the provided data to `setImmediates` to be a multiple of 4.
+        const elementSize = kTypedArrayBufferViews[t.typedArray].BYTES_PER_ELEMENT;
+        const actualDataOffset = t.dataOffset ?? 0;
+
+        // Let's test a larger size to accommodate BigUint64Array/BigInt64Array matrices more easily
+        const maxElementsIn64Bytes = 64 / elementSize;
+        const actualDataSize = t.dataSize ?? maxElementsIn64Bytes - actualDataOffset;
+        const byteSize = actualDataSize * elementSize;
+
+        return byteSize <= 64 && byteSize % 4 === 0;
+      })
   )
   .fn(t => {
-    const { typedArray, encoderType } = t.params;
+    const { typedArray, encoderType, dataOffset, dataSize } = t.params;
     t.skipIf(typedArray === 'Float16Array', 'TODO(#4297): Float16Array not yet supported');
     t.skipIfStorageBuffersInFragmentStageNotAvailable(encoderType);
     const Ctor = kTypedArrayBufferViews[typedArray];
     const elementSize = Ctor.BYTES_PER_ELEMENT;
 
-    // Write a known pattern.
-    // Use a buffer of 8 bytes.
-    // Pattern: 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08
-    const bytePattern = [0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08];
-
-    // Create a large buffer to test offset.
-    // Use offset = 2 elements.
-    // Need enough data.
-    // Construct the TypedArray such that at index `offset`, the pattern exists.
-    const offset = 2;
-    const elementCount = 8 / elementSize; // Write 8 bytes.
-
-    // Total elements needed: offset + elementCount + padding
-    const totalElements = offset + elementCount + 2;
-    const arr = new Ctor(totalElements);
-
-    // Fill with non-zero value.
-    const fillView = new Uint8Array(arr.buffer);
-    fillView.fill(0xaa);
-
-    // The bytes at `arr[offset]...` should match `bytePattern`.
-    // Use a DataView on the array's buffer to set the bytes.
-    const buffer = arr.buffer;
-    const byteOffset = arr.byteOffset + offset * elementSize;
-    const view = new DataView(buffer);
-
-    for (let i = 0; i < 8; i++) {
-      view.setUint8(byteOffset + i, bytePattern[i]);
-    }
-
-    // Now `arr` contains the pattern at `offset`.
-
-    // Shader: read 2 u32s (8 bytes).
-    const wgslDecl = 'var<immediate> data: array<u32, 2>;';
+    // Use a baseline of 16 u32s (64 bytes).
+    const immediateSize = 64;
+    const wgslDecl = `
+      struct ImmediateData {
+        m0: vec4<u32>,
+        m1: vec4<u32>,
+        m2: vec4<u32>,
+        m3: vec4<u32>
+      }
+      var<immediate> data: ImmediateData;
+    `;
     const copyCode = `
-      output[0] = data[0];
-      output[1] = data[1];
+      output[0] = data.m0.x;
+      output[1] = data.m0.y;
+      output[2] = data.m0.z;
+      output[3] = data.m0.w;
+      output[4] = data.m1.x;
+      output[5] = data.m1.y;
+      output[6] = data.m1.z;
+      output[7] = data.m1.w;
+      output[8] = data.m2.x;
+      output[9] = data.m2.y;
+      output[10] = data.m2.z;
+      output[11] = data.m2.w;
+      output[12] = data.m3.x;
+      output[13] = data.m3.y;
+      output[14] = data.m3.z;
+      output[15] = data.m3.w;
     `;
+    const pipeline = createPipeline(t, encoderType, wgslDecl, copyCode, immediateSize);
 
-    const pipeline = createPipeline(t, encoderType, wgslDecl, copyCode, 8);
+    const actualDataOffset = dataOffset ?? 0;
+    // We want to write `dataSize` elements. If dataSize is undefined, it defaults to:
+    // array.length - dataOffset
+    const maxElementsIn64Bytes = 64 / elementSize;
+    const actualDataSize = dataSize ?? maxElementsIn64Bytes - actualDataOffset;
+
+    // Create a typed array buffer. If dataSize is undefined, we shouldn't add padding
+    // because setImmediates uses the rest of the array, which would exceed our intended size or break the % 4 rule.
+    const paddingElements = dataSize === undefined ? 0 : 4;
+    const arr = new Ctor(actualDataOffset + actualDataSize + paddingElements);
+    const view = new DataView(arr.buffer);
+
+    for (let i = 0; i < actualDataSize; i++) {
+      // Just fill some bytes. E.g., repeating byte pattern.
+      for (let b = 0; b < elementSize; b++) {
+        view.setUint8((actualDataOffset + i) * elementSize + b, 0x10 + b + i);
+      }
+    }
+
+    const commandEncoder = t.device.createCommandEncoder();
 
     const outputBuffer = t.createBufferTracked({
-      size: 8,
+      size: 64,
       usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
     });
-    const indexUniformBuffer = t.makeBufferWithContents(
-      new Uint32Array(256 / 4),
-      GPUBufferUsage.UNIFORM
-    );
-
+    const indexUniformBuffer = createOutputIndexBuffer(t, 1);
     const bindGroup = t.device.createBindGroup({
       layout: pipeline.getBindGroupLayout(0),
       entries: [
@@ -635,24 +677,39 @@ g.test('typed_array_arguments')
       ],
     });
 
-    const commandEncoder = t.device.createCommandEncoder();
     encodeForPassType(t, encoderType, commandEncoder, enc => {
-      if (encoderType === 'compute pass') {
-        (enc as GPUComputePassEncoder).setPipeline(pipeline as GPUComputePipeline);
+      setPipeline(encoderType, enc, pipeline);
+      enc.setBindGroup(0, bindGroup, [0]);
+
+      // Initialize pipeline storage to a clear pattern mapping to up to 64 bytes
+      const clearData = new Uint32Array(16);
+      for (let i = 0; i < 16; i++) clearData[i] = 0xaaaaaaaa + i * 0x11111111;
+      enc.setImmediates!(0, clearData);
+
+      if (dataSize === undefined) {
+        if (dataOffset === undefined) {
+          enc.setImmediates!(0, arr);
+        } else {
+          enc.setImmediates!(0, arr, dataOffset);
+        }
       } else {
-        (enc as GPURenderPassEncoder | GPURenderBundleEncoder).setPipeline(
-          pipeline as GPURenderPipeline
-        );
+        enc.setImmediates!(0, arr, dataOffset, dataSize);
       }
-      enc.setBindGroup(0, bindGroup, [0]);
-      enc.setImmediates!(0, arr, offset, elementCount);
+
       dispatchOrDraw(encoderType, enc);
     });
-
     t.device.queue.submit([commandEncoder.finish()]);
 
-    // Expected: 0x04030201, 0x08070605 (Little Endian)
-    const expected = new Uint32Array([0x04030201, 0x08070605]);
+    const expected = new Uint32Array(16);
+    for (let i = 0; i < 16; i++) expected[i] = 0xaaaaaaaa + i * 0x11111111;
+    const expectedView = new Uint8Array(expected.buffer);
+
+    for (let i = 0; i < actualDataSize; i++) {
+      for (let b = 0; b < elementSize; b++) {
+        expectedView[i * elementSize + b] = 0x10 + b + i;
+      }
+    }
+
     t.expectGPUBufferValuesEqual(outputBuffer, expected);
   });
 

From 1ddbf941afa4cf8e5f590444578cabef828f7adb Mon Sep 17 00:00:00 2001
From: shaoboyan <shaoboyan@microsoft.com>
Date: Wed, 11 Mar 2026 11:31:41 +0800
Subject: [PATCH 4/5] Iterating on typed array case and use outIndex every
 where

---
 .../programmable/immediate.spec.ts            | 179 +++++++++---------
 1 file changed, 93 insertions(+), 86 deletions(-)

diff --git a/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts b/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts
index a7f3a63a8ee4..6db9ab8f7c1b 100644
--- a/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts
+++ b/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts
@@ -5,8 +5,10 @@ Operation tests for immediate data usage in RenderPassEncoder, ComputePassEncode
 import { makeTestGroup } from '../../../../../common/framework/test_group.js';
 import { getGPU } from '../../../../../common/util/navigator_gpu.js';
 import {
+  assert,
   kTypedArrayBufferViews,
   kTypedArrayBufferViewKeys,
+  memcpy,
   supportsImmediateData,
   unreachable,
 } from '../../../../../common/util/util.js';
@@ -135,8 +137,9 @@ function dispatchOrDraw(
 
 /**
  * Create a uniform buffer with output indices at 256-byte aligned offsets for dynamic binding.
- * This is used instead of other mechanisms to provide an output index (like firstVertex or immediates)
- * because it works across all shader stages without consuming the immediate data capability that we are actively testing.
+ * A uniform buffer with dynamic offsets is used to provide the output index because:
+ *  1. It works uniformly across all shader stages (compute, vertex, fragment).
+ *  2. It doesn't consume the immediate data capability that these tests are actively exercising.
  */
 function createOutputIndexBuffer(t: AllFeaturesMaxLimitsGPUTest, count: number): GPUBuffer {
   const buffer = t.createBufferTracked({
@@ -229,11 +232,18 @@ function runAndCheck(
   ) => void,
   expectedValues: number[]
 ) {
+  assert(expectedValues.length > 0, 'expectedValues must not be empty');
   const outputBuffer = t.createBufferTracked({
     size: expectedValues.length * 4,
     usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
   });
-  // Simple uniform for outIndex = 0. Dynamic offset is always [0].
+  // A dynamic-offset uniform buffer supplies outIndex = 0 here.
+  // We use a uniform buffer (rather than e.g. firstVertex via @builtin(vertex_index)) because:
+  //  - It works across all shader stages (compute, vertex, fragment).
+  //  - firstVertex is emulated via root constants on D3D12, which is the same mechanism
+  //    backing var<immediate>, so using it could mask bugs in the path under test.
+  // The pipeline layout declares hasDynamicOffset, so we must always pass a dynamic offset
+  // array — even though this simple helper only ever uses offset [0].
   const indexUniformBuffer = t.makeBufferWithContents(new Uint32Array([0]), GPUBufferUsage.UNIFORM);
 
   const bindGroup = t.device.createBindGroup({
@@ -339,7 +349,7 @@ g.test('basic_execution')
       encoderType,
       pipeline,
       encoder => {
-        encoder.setImmediates!(0, inputData, 0, inputData.length);
+        encoder.setImmediates!(0, inputData);
       },
       expected
     );
@@ -368,6 +378,8 @@ g.test('update_data')
       usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
     });
 
+    // Uniform buffer with output indices [0, 1, 2] at 256-byte aligned offsets,
+    // used to direct each dispatch/draw step to a separate region of the output buffer.
     const indexUniformBuffer = createOutputIndexBuffer(t, 3);
 
     const bindGroup = t.device.createBindGroup({
@@ -388,7 +400,7 @@ g.test('update_data')
       dstOffset: number = 0
     ) => {
       pass.setBindGroup(0, bindGroup, [stepIndex * 256]);
-      pass.setImmediates!(dstOffset, data, 0, data.length);
+      pass.setImmediates!(dstOffset, data);
       dispatchOrDraw(encoderType, pass);
     };
 
@@ -453,6 +465,11 @@ g.test('pipeline_switch')
       immediateSizeB = 8;
     }
 
+    // Same source data for both cases; dataSize controls how many elements are written.
+    const immDataB = new Uint32Array([5, 6, 7, 8]);
+    const immDataSizeB = sameImmediateSize ? undefined : immediateSizeB / 4;
+    const expectedB = sameImmediateSize ? [5, 6, 7, 8] : [5, 6, 0, 0];
+
     const bindGroupLayout = t.device.createBindGroupLayout({
       entries: [
         {
@@ -481,15 +498,11 @@ g.test('pipeline_switch')
     const pipelineB = createPipeline(t, encoderType, wgslDeclB, copyCodeB, immediateSizeB, layoutB);
 
     const outputBuffer = t.createBufferTracked({
-      size: 32,
+      size: 16, // 4 u32s at outIndex 0
       usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
     });
-    const indexUniformBuffer = createOutputIndexBuffer(t, 2);
+    const indexUniformBuffer = createOutputIndexBuffer(t, 1);
 
-    // Use a single bind group. Both pipelines have the same bind group layout,
-    // so the same bind group is compatible with both. This verifies that
-    // previously bound descriptor sets remain valid after a pipeline switch
-    // (important for Vulkan correctness).
     const bindGroup = t.device.createBindGroup({
       layout: bindGroupLayout,
       entries: [
@@ -498,32 +511,26 @@ g.test('pipeline_switch')
       ],
     });
 
-    const immData = new Uint32Array([1, 2, 3, 4]);
-
     const commandEncoder = t.device.createCommandEncoder();
     encodeForPassType(t, encoderType, commandEncoder, enc => {
-      // Pipeline A: set immediates [1, 2, 3, 4], dispatch/draw at outIndex 0.
+      // Only set bind group once between bind group compatible pipelines.
       setPipeline(encoderType, enc, pipelineA);
       enc.setBindGroup(0, bindGroup, [0]);
-      enc.setImmediates!(0, immData, 0, immData.length);
-      dispatchOrDraw(encoderType, enc);
+      enc.setImmediates!(0, new Uint32Array([1, 2, 3, 4]));
 
-      // Pipeline B: switch pipeline, update dynamic offset for outIndex 1.
+      // Switch to Pipeline B without re-setting the bind group.
+      // The bind group set under Pipeline A must remain valid.
       setPipeline(encoderType, enc, pipelineB);
-      enc.setBindGroup(0, bindGroup, [256]);
-      // We purposefully don't call setImmediates here to verify that the
-      // immediates set prior to the pipeline switch are preserved.
+      // Same source data; dataSize controls how many elements are written.
+      // Passing undefined for srcOffset/srcSize relies on WebIDL defaults (0 / array.length).
+      enc.setImmediates!(0, immDataB, undefined, immDataSizeB);
       dispatchOrDraw(encoderType, enc);
     });
 
     t.device.queue.submit([commandEncoder.finish()]);
 
-    // Pipeline A wrote [1, 2, 3, 4] at outIndex 0.
-    // Pipeline B wrote [1, 2, 3, 4] (same size) or [1, 2, 0, 0] (different size) at outIndex 1.
-    const expected = sameImmediateSize
-      ? new Uint32Array([1, 2, 3, 4, 1, 2, 3, 4])
-      : new Uint32Array([1, 2, 3, 4, 1, 2, 0, 0]);
-    t.expectGPUBufferValuesEqual(outputBuffer, expected);
+    // Pipeline B's draw used the bind group set under Pipeline A.
+    t.expectGPUBufferValuesEqual(outputBuffer, new Uint32Array(expectedB));
   });
 
 g.test('use_max_immediate_size')
@@ -575,7 +582,7 @@ g.test('use_max_immediate_size')
 
       setPipeline(encoderType, enc, pipeline);
       enc.setBindGroup(0, bindGroup, [0]);
-      enc.setImmediates!(0, data, 0, count);
+      enc.setImmediates!(0, data);
       dispatchOrDraw(encoderType, enc);
     });
 
@@ -590,19 +597,16 @@ g.test('typed_array_arguments')
       .combine('typedArray', kTypedArrayBufferViewKeys)
       .combine('encoderType', kProgrammableEncoderTypes)
       .beginSubcases()
-      .combine('dataOffset', [undefined, 0, 2])
-      .combine('dataSize', [undefined, 2])
-      .filter(t => {
-        // WebGPU requires the byte size of the provided data to `setImmediates` to be a multiple of 4.
-        const elementSize = kTypedArrayBufferViews[t.typedArray].BYTES_PER_ELEMENT;
-        const actualDataOffset = t.dataOffset ?? 0;
-
-        // Let's test a larger size to accommodate BigUint64Array/BigInt64Array matrices more easily
-        const maxElementsIn64Bytes = 64 / elementSize;
-        const actualDataSize = t.dataSize ?? maxElementsIn64Bytes - actualDataOffset;
-        const byteSize = actualDataSize * elementSize;
-
-        return byteSize <= 64 && byteSize % 4 === 0;
+      .expandWithParams(function* (p) {
+        const elementSize = kTypedArrayBufferViews[p.typedArray].BYTES_PER_ELEMENT;
+        // Smallest element count that produces a 4-byte-aligned byte size.
+        const smallCount = Math.max(1, Math.ceil(4 / elementSize));
+        yield { dataOffset: undefined, dataSize: undefined };
+        yield { dataOffset: 0, dataSize: undefined };
+        yield { dataOffset: smallCount, dataSize: undefined };
+        yield { dataOffset: undefined, dataSize: smallCount };
+        yield { dataOffset: 0, dataSize: smallCount };
+        yield { dataOffset: smallCount, dataSize: smallCount };
       })
   )
   .fn(t => {
@@ -612,8 +616,9 @@ g.test('typed_array_arguments')
     const Ctor = kTypedArrayBufferViews[typedArray];
     const elementSize = Ctor.BYTES_PER_ELEMENT;
 
-    // Use a baseline of 16 u32s (64 bytes).
-    const immediateSize = 64;
+    // 64 bytes of immediate data (4 x vec4<u32>). This size must match the WGSL struct below.
+    const kImmediateByteSize = 64;
+    const kImmediateU32Count = kImmediateByteSize / 4;
     const wgslDecl = `
       struct ImmediateData {
         m0: vec4<u32>,
@@ -641,31 +646,39 @@ g.test('typed_array_arguments')
       output[14] = data.m3.z;
       output[15] = data.m3.w;
     `;
-    const pipeline = createPipeline(t, encoderType, wgslDecl, copyCode, immediateSize);
+    const pipeline = createPipeline(t, encoderType, wgslDecl, copyCode, kImmediateByteSize);
 
     const actualDataOffset = dataOffset ?? 0;
-    // We want to write `dataSize` elements. If dataSize is undefined, it defaults to:
-    // array.length - dataOffset
-    const maxElementsIn64Bytes = 64 / elementSize;
-    const actualDataSize = dataSize ?? maxElementsIn64Bytes - actualDataOffset;
-
-    // Create a typed array buffer. If dataSize is undefined, we shouldn't add padding
-    // because setImmediates uses the rest of the array, which would exceed our intended size or break the % 4 rule.
-    const paddingElements = dataSize === undefined ? 0 : 4;
+    const maxElements = kImmediateByteSize / elementSize;
+    const actualDataSize = dataSize ?? maxElements - actualDataOffset;
+
+    // Validate that the byte size is 4-byte aligned and fits in the immediate block.
+    const byteSize = actualDataSize * elementSize;
+    assert(
+      byteSize <= kImmediateByteSize && byteSize % 4 === 0,
+      `byteSize ${byteSize} must be <= ${kImmediateByteSize} and a multiple of 4`
+    );
+
+    // When dataSize is explicit, add padding elements to verify setImmediates
+    // respects the dataSize boundary and doesn't read beyond it.
+    // When dataSize is undefined, no padding since setImmediates reads to the end of the array.
+    const paddingElements = dataSize !== undefined ? 4 : 0;
     const arr = new Ctor(actualDataOffset + actualDataSize + paddingElements);
     const view = new DataView(arr.buffer);
 
+    // Fill the data region with a recognizable byte pattern.
     for (let i = 0; i < actualDataSize; i++) {
-      // Just fill some bytes. E.g., repeating byte pattern.
       for (let b = 0; b < elementSize; b++) {
         view.setUint8((actualDataOffset + i) * elementSize + b, 0x10 + b + i);
       }
     }
 
-    const commandEncoder = t.device.createCommandEncoder();
+    // Baseline clear pattern for the full immediate block.
+    const clearData = new Uint32Array(kImmediateU32Count);
+    for (let i = 0; i < kImmediateU32Count; i++) clearData[i] = 0xaaaaaaaa + i * 0x11111111;
 
     const outputBuffer = t.createBufferTracked({
-      size: 64,
+      size: kImmediateByteSize,
       usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
     });
     const indexUniformBuffer = createOutputIndexBuffer(t, 1);
@@ -677,38 +690,32 @@ g.test('typed_array_arguments')
       ],
     });
 
+    const commandEncoder = t.device.createCommandEncoder();
     encodeForPassType(t, encoderType, commandEncoder, enc => {
       setPipeline(encoderType, enc, pipeline);
       enc.setBindGroup(0, bindGroup, [0]);
 
-      // Initialize pipeline storage to a clear pattern mapping to up to 64 bytes
-      const clearData = new Uint32Array(16);
-      for (let i = 0; i < 16; i++) clearData[i] = 0xaaaaaaaa + i * 0x11111111;
+      // Initialize immediates to the baseline clear pattern.
       enc.setImmediates!(0, clearData);
 
-      if (dataSize === undefined) {
-        if (dataOffset === undefined) {
-          enc.setImmediates!(0, arr);
-        } else {
-          enc.setImmediates!(0, arr, dataOffset);
-        }
-      } else {
-        enc.setImmediates!(0, arr, dataOffset, dataSize);
-      }
+      // Overwrite with typed array data using the parametrized offset/size.
+      // Passing undefined for dataOffset/dataSize uses the WebIDL default (0 / array.length).
+      enc.setImmediates!(0, arr, dataOffset, dataSize);
 
       dispatchOrDraw(encoderType, enc);
     });
     t.device.queue.submit([commandEncoder.finish()]);
 
-    const expected = new Uint32Array(16);
-    for (let i = 0; i < 16; i++) expected[i] = 0xaaaaaaaa + i * 0x11111111;
-    const expectedView = new Uint8Array(expected.buffer);
-
-    for (let i = 0; i < actualDataSize; i++) {
-      for (let b = 0; b < elementSize; b++) {
-        expectedView[i * elementSize + b] = 0x10 + b + i;
-      }
-    }
+    // Build expected: baseline pattern with the written typed-array bytes overlaid at offset 0.
+    const expected = new Uint32Array(clearData);
+    memcpy(
+      {
+        src: arr.buffer,
+        start: actualDataOffset * elementSize,
+        length: actualDataSize * elementSize,
+      },
+      { dst: expected.buffer, start: 0 }
+    );
 
     t.expectGPUBufferValuesEqual(outputBuffer, expected);
   });
@@ -733,11 +740,11 @@ g.test('multiple_updates_before_draw_or_dispatch')
       pipeline,
       encoder => {
         // 1. Set all to [1, 2, 3, 4]
-        encoder.setImmediates!(0, new Uint32Array([1, 2, 3, 4]), 0, 4);
+        encoder.setImmediates!(0, new Uint32Array([1, 2, 3, 4]));
         // 2. Update middle two to [5, 6] -> [1, 5, 6, 4]
-        encoder.setImmediates!(4, new Uint32Array([5, 6]), 0, 2);
+        encoder.setImmediates!(4, new Uint32Array([5, 6]));
         // 3. Update last to [7] -> [1, 5, 6, 7]
-        encoder.setImmediates!(12, new Uint32Array([7]), 0, 1);
+        encoder.setImmediates!(12, new Uint32Array([7]));
       },
       [1, 5, 6, 7]
     );
@@ -775,8 +782,8 @@ g.test('render_pass_and_bundle_mix')
     const bundleEncoder = t.device.createRenderBundleEncoder({ colorFormats: ['r32uint'] });
     bundleEncoder.setPipeline(pipeline);
     bundleEncoder.setBindGroup(0, bindGroup, [0]);
-    bundleEncoder.setImmediates!(0, new Uint32Array([1, 10]), 0, 2);
-    bundleEncoder.draw(1, 1, 0, 0);
+    bundleEncoder.setImmediates!(0, new Uint32Array([1, 10]));
+    bundleEncoder.draw(1);
     const bundle = bundleEncoder.finish();
 
     const renderTargetTexture = t.createTextureTracked({
@@ -802,8 +809,8 @@ g.test('render_pass_and_bundle_mix')
     // Pass: Set [2, 20], Draw (Index 1)
     pass.setPipeline(pipeline);
     pass.setBindGroup(0, bindGroup, [256]);
-    pass.setImmediates!(0, new Uint32Array([2, 20]), 0, 2);
-    pass.draw(1, 1, 1, 0);
+    pass.setImmediates!(0, new Uint32Array([2, 20]));
+    pass.draw(1);
 
     pass.end();
     t.device.queue.submit([commandEncoder.finish()]);
@@ -841,16 +848,16 @@ g.test('render_bundle_isolation')
     const bundleEncoderA = t.device.createRenderBundleEncoder({ colorFormats: ['r32uint'] });
     bundleEncoderA.setPipeline(pipeline);
     bundleEncoderA.setBindGroup(0, bindGroup, [0]);
-    bundleEncoderA.setImmediates!(0, new Uint32Array([1, 2]), 0, 2);
-    bundleEncoderA.draw(1, 1, 0, 0);
+    bundleEncoderA.setImmediates!(0, new Uint32Array([1, 2]));
+    bundleEncoderA.draw(1);
     const bundleA = bundleEncoderA.finish();
 
     // Bundle B: Set [3, 4], Draw (Index 1)
     const bundleEncoderB = t.device.createRenderBundleEncoder({ colorFormats: ['r32uint'] });
     bundleEncoderB.setPipeline(pipeline);
     bundleEncoderB.setBindGroup(0, bindGroup, [256]);
-    bundleEncoderB.setImmediates!(0, new Uint32Array([3, 4]), 0, 2);
-    bundleEncoderB.draw(1, 1, 1, 0);
+    bundleEncoderB.setImmediates!(0, new Uint32Array([3, 4]));
+    bundleEncoderB.draw(1);
     const bundleB = bundleEncoderB.finish();
 
     const renderTargetTexture = t.createTextureTracked({

From af03a72cda32de20db8ac82f799689a0e33afeb6 Mon Sep 17 00:00:00 2001
From: shaoboyan <shaoboyan@microsoft.com>
Date: Mon, 16 Mar 2026 14:50:54 +0800
Subject: [PATCH 5/5] Address comments

---
 .../command_buffer/programmable/immediate.spec.ts  | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts b/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts
index 6db9ab8f7c1b..13512fbac35a 100644
--- a/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts
+++ b/src/webgpu/api/operation/command_buffer/programmable/immediate.spec.ts
@@ -442,8 +442,7 @@ g.test('pipeline_switch')
     // Pipeline A always uses vec4<u32> (16 bytes).
     const wgslDeclA = 'var<immediate> data: vec4<u32>;';
     const copyCodeA = `
-      let base = outIndex * 4;
-      output[base] = data.x; output[base+1] = data.y; output[base+2] = data.z; output[base+3] = data.w;
+      output[0] = data.x; output[1] = data.y; output[2] = data.z; output[3] = data.w;
     `;
 
     let wgslDeclB: string;
@@ -459,8 +458,7 @@ g.test('pipeline_switch')
       // Pipeline B uses vec2<u32> (8 bytes) — different/incompatible layout.
       wgslDeclB = 'var<immediate> data: vec2<u32>;';
       copyCodeB = `
-        let base = outIndex * 4;
-        output[base] = data.x; output[base+1] = data.y; output[base+2] = 0u; output[base+3] = 0u;
+        output[0] = data.x; output[1] = data.y; output[2] = 0u; output[3] = 0u;
       `;
       immediateSizeB = 8;
     }
@@ -667,10 +665,10 @@ g.test('typed_array_arguments')
     const view = new DataView(arr.buffer);
 
     // Fill the data region with a recognizable byte pattern.
-    for (let i = 0; i < actualDataSize; i++) {
-      for (let b = 0; b < elementSize; b++) {
-        view.setUint8((actualDataOffset + i) * elementSize + b, 0x10 + b + i);
-      }
+    const dataByteOffset = actualDataOffset * elementSize;
+    const dataByteSize = actualDataSize * elementSize;
+    for (let byte = 0; byte < dataByteSize; byte++) {
+      view.setUint8(dataByteOffset + byte, 0x10 + byte);
     }
 
     // Baseline clear pattern for the full immediate block.