[HLSL][Matrix] Make Matrix InitListExprs and AST row-major order, and respect /Zpr and /Zpc in codegen (llvm#182904)

Icohedron · farzonl · web-flow · commit ae363d50ad29 · 2026-03-02T09:46:40.000-08:00
Fixes llvm#166410 and llvm#181902. This PR keeps matrix initializer lists in row-major order in InitListExpr and the AST for HLSL by no longer reordering the element indices in `InitListChecker::CheckMatrixType` in `clang/lib/Sema/SemaInit.cpp`. It also makes codegen respect /Zpr and /Zpc for matrix initializer lists by adding a vector shuffle to `VisitInitListExpr` in `clang/lib/CodeGen/CGExprScalar.cpp`. Assisted-by: claude-opus-4.6 --------- Co-authored-by: Farzon Lotfi <farzonl@gmail.com>
diff --git a/clang/include/clang/AST/TypeBase.h b/clang/include/clang/AST/TypeBase.h
@@ -4418,6 +4418,45 @@ class ConstantMatrixType final : public MatrixType {
     return getNumRows() * getNumColumns();
   }
 
+  /// Returns the row-major flattened index (Row * NumColumns + Column) of the
+  /// matrix element located at row \p Row and column \p Column.
+  unsigned getRowMajorFlattenedIndex(unsigned Row, unsigned Column) const {
+    return Row * NumColumns + Column;
+  }
+
+  /// Returns the column-major flattened index (Column * NumRows + Row) of the
+  /// matrix element located at row \p Row and column \p Column.
+  unsigned getColumnMajorFlattenedIndex(unsigned Row, unsigned Column) const {
+    return Column * NumRows + Row;
+  }
+
+  /// Returns the flattened index of the matrix element at row \p Row and
+  /// column \p Column: the row-major index if \p IsRowMajor is true, otherwise
+  /// the column-major index (the default). Declared const, like the helpers it
+  /// forwards to, so it is callable on a const ConstantMatrixType.
+  unsigned getFlattenedIndex(unsigned Row, unsigned Column,
+                             bool IsRowMajor = false) const {
+    return IsRowMajor ? getRowMajorFlattenedIndex(Row, Column)
+                      : getColumnMajorFlattenedIndex(Row, Column);
+  }
+
+  /// Converts the column-major flattened index \p ColumnMajorIdx into the
+  /// row-major flattened index that addresses the same matrix element.
+  unsigned
+  mapColumnMajorToRowMajorFlattenedIndex(unsigned ColumnMajorIdx) const {
+    unsigned ElemColumn = ColumnMajorIdx / NumRows;
+    unsigned ElemRow = ColumnMajorIdx % NumRows;
+    return getRowMajorFlattenedIndex(ElemRow, ElemColumn);
+  }
+
+  /// Converts the row-major flattened index \p RowMajorIdx into the
+  /// column-major flattened index that addresses the same matrix element.
+  unsigned mapRowMajorToColumnMajorFlattenedIndex(unsigned RowMajorIdx) const {
+    unsigned ElemRow = RowMajorIdx / NumColumns;
+    unsigned ElemColumn = RowMajorIdx % NumColumns;
+    return getColumnMajorFlattenedIndex(ElemRow, ElemColumn);
+  }
+
   void Profile(llvm::FoldingSetNodeID &ID) {
     Profile(ID, getElementType(), getNumRows(), getNumColumns(),
             getTypeClass());
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2450,6 +2450,20 @@ Value *ScalarExprEmitter::VisitInitListExpr(InitListExpr *E) {
     llvm::Value *Init = llvm::Constant::getNullValue(EltTy);
     V = Builder.CreateInsertElement(V, Init, Idx, "vecinit");
   }
+
+  // Matrix initializer lists are in row-major order but the memory layout for
+  // codegen is determined by the -fmatrix-memory-layout flag (default:
+  // column-major). When the memory layout is column-major, we need to shuffle
+  // the elements from row-major to column-major order.
+  if (const auto *MT = E->getType()->getAs<ConstantMatrixType>();
+      MT && CGF.getLangOpts().getDefaultMatrixMemoryLayout() ==
+                LangOptions::MatrixMemoryLayout::MatrixColMajor) {
+    SmallVector<int, 16> Mask;
+    for (unsigned I = 0, N = MT->getNumElementsFlattened(); I < N; ++I)
+      Mask.push_back(MT->mapColumnMajorToRowMajorFlattenedIndex(I));
+    V = Builder.CreateShuffleVector(V, Mask, "matrix.rowmajor2colmajor");
+  }
+
   return V;
 }
 
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
@@ -1910,17 +1910,16 @@ void InitListChecker::CheckMatrixType(const InitializedEntity &Entity,
   QualType ElemTy = MT->getElementType();
 
   Index = 0;
-  InitializedEntity ElemEnt =
+  InitializedEntity Element =
       InitializedEntity::InitializeElement(SemaRef.Context, 0, Entity);
 
   while (Index < IList->getNumInits()) {
     // Not a sublist: just consume directly.
-    unsigned ColMajorIndex = (Index % MT->getNumRows()) * MT->getNumColumns() +
-                             (Index / MT->getNumRows());
-    ElemEnt.setElementIndex(ColMajorIndex);
-    CheckSubElementType(ElemEnt, IList, ElemTy, ColMajorIndex, StructuredList,
+    // Note: In HLSL, elements of the InitListExpr are in row-major order, so no
+    // change is needed to the Index.
+    Element.setElementIndex(Index);
+    CheckSubElementType(Element, IList, ElemTy, Index, StructuredList,
                         StructuredIndex);
-    ++Index;
   }
 }
 
diff --git a/clang/test/AST/HLSL/matrix-constructors.hlsl b/clang/test/AST/HLSL/matrix-constructors.hlsl
diff --git a/clang/test/AST/HLSL/matrix-general-initializer.hlsl b/clang/test/AST/HLSL/matrix-general-initializer.hlsl
diff --git a/clang/test/AST/HLSL/matrix-init-list-row-major.hlsl b/clang/test/AST/HLSL/matrix-init-list-row-major.hlsl
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -finclude-default-header -o - %s | FileCheck %s
+
+// This test verifies that matrix initializer lists in HLSL use row-major
+// element ordering. The elements in the AST InitListExpr remain in
+// row-major order as written in the source code.
+
+// The AST InitListExpr preserves this row-major source order.
+// CHECK: VarDecl {{.*}} m2x2 'float2x2':'matrix<float, 2, 2>' cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'float2x2':'matrix<float, 2, 2>'
+// CHECK-NEXT: FloatingLiteral {{.*}} 'float' 1.000000e+00
+// CHECK-NEXT: FloatingLiteral {{.*}} 'float' 2.000000e+00
+// CHECK-NEXT: FloatingLiteral {{.*}} 'float' 3.000000e+00
+// CHECK-NEXT: FloatingLiteral {{.*}} 'float' 4.000000e+00
+export void test_2x2() {
+  float2x2 m2x2 = {1.0, 2.0, 3.0, 4.0};
+}
+
+// CHECK: VarDecl {{.*}} m2x3 'int2x3':'matrix<int, 2, 3>' cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'int2x3':'matrix<int, 2, 3>'
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 1
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 2
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 4
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 5
+// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 6
+export void test_2x3() {
+  int2x3 m2x3 = {1, 2, 3, 4, 5, 6};
+}
+
+// CHECK: VarDecl {{.*}} m3x2 'bool3x2':'matrix<bool, 3, 2>' cinit
+// CHECK-NEXT: InitListExpr {{.*}} 'bool3x2':'matrix<bool, 3, 2>'
+// CHECK-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' true
+// CHECK-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' false
+// CHECK-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' false
+// CHECK-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' true
+// CHECK-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' true
+// CHECK-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' true
+export void test_3x2() {
+  bool3x2 m3x2 = {true, false, false, true, true, true};
+}
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl
@@ -13,7 +13,7 @@ float3x2 case1() {
   // vec[3] = 1
   // vec[4] = 3
   // vec[5] = 5
-  return float3x2(0, 1, 
+  return float3x2(0, 1,
                   2, 3,
                   4, 5);
 }
@@ -24,25 +24,26 @@ RWStructuredBuffer<float> In;
 // CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case2v(
 // CHECK-SAME: ) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 0) #[[ATTR3:[0-9]+]]
-// CHECK-NEXT:    [[CALL1:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 1) #[[ATTR3]]
-// CHECK-NEXT:    [[CALL2:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 2) #[[ATTR3]]
-// CHECK-NEXT:    [[CALL3:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 3) #[[ATTR3]]
-// CHECK-NEXT:    [[CALL4:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 4) #[[ATTR3]]
-// CHECK-NEXT:    [[CALL5:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 5) #[[ATTR3]]
+// CHECK-NEXT:    [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 0) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT:    [[CALL1:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 1) #[[ATTR4]]
+// CHECK-NEXT:    [[CALL2:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 2) #[[ATTR4]]
+// CHECK-NEXT:    [[CALL3:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 3) #[[ATTR4]]
+// CHECK-NEXT:    [[CALL4:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 4) #[[ATTR4]]
+// CHECK-NEXT:    [[CALL5:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 5) #[[ATTR4]]
 // CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[CALL]], align 4
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[TMP0]], i32 0
-// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[CALL2]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[CALL1]], align 4
 // CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT]], float [[TMP1]], i32 1
-// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[CALL4]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[CALL2]], align 4
 // CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[TMP2]], i32 2
-// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[CALL1]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[CALL3]], align 4
 // CHECK-NEXT:    [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT7]], float [[TMP3]], i32 3
-// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[CALL3]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[CALL4]], align 4
 // CHECK-NEXT:    [[VECINIT9:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[TMP4]], i32 4
 // CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[CALL5]], align 4
 // CHECK-NEXT:    [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT9]], float [[TMP5]], i32 5
-// CHECK-NEXT:    ret <6 x float> [[VECINIT10]]
+// CHECK-NEXT:    [[MATRIX_ROWMAJOR2COLMAJOR:%.*]] = shufflevector <6 x float> [[VECINIT10]], <6 x float> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+// CHECK-NEXT:    ret <6 x float> [[MATRIX_ROWMAJOR2COLMAJOR]]
 //
 float3x2 case2() {
   // vec[0] = Call
@@ -51,7 +52,7 @@ float3x2 case2() {
   // vec[3] = Call1
   // vec[4] = Call3
   // vec[5] = Call5
-  return float3x2(In[0], In[1], 
+  return float3x2(In[0], In[1],
                   In[2], In[3],
                   In[4], In[5]);
 }
@@ -68,28 +69,29 @@ float3x2 case2() {
 // CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <3 x float> [[TMP0]], i64 0
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[VECEXT]], i32 0
 // CHECK-NEXT:    [[TMP1:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16
-// CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <3 x float> [[TMP1]], i64 2
+// CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <3 x float> [[TMP1]], i64 1
 // CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <6 x float> [[VECINIT]], float [[VECEXT1]], i32 1
-// CHECK-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    [[VECEXT3:%.*]] = extractelement <3 x float> [[TMP2]], i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[VECEXT3:%.*]] = extractelement <3 x float> [[TMP2]], i64 2
 // CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <6 x float> [[VECINIT2]], float [[VECEXT3]], i32 2
-// CHECK-NEXT:    [[TMP3:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16
-// CHECK-NEXT:    [[VECEXT5:%.*]] = extractelement <3 x float> [[TMP3]], i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[VECEXT5:%.*]] = extractelement <3 x float> [[TMP3]], i64 0
 // CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT4]], float [[VECEXT5]], i32 3
 // CHECK-NEXT:    [[TMP4:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16
-// CHECK-NEXT:    [[VECEXT7:%.*]] = extractelement <3 x float> [[TMP4]], i64 0
+// CHECK-NEXT:    [[VECEXT7:%.*]] = extractelement <3 x float> [[TMP4]], i64 1
 // CHECK-NEXT:    [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[VECEXT7]], i32 4
 // CHECK-NEXT:    [[TMP5:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16
 // CHECK-NEXT:    [[VECEXT9:%.*]] = extractelement <3 x float> [[TMP5]], i64 2
 // CHECK-NEXT:    [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[VECEXT9]], i32 5
-// CHECK-NEXT:    ret <6 x float> [[VECINIT10]]
+// CHECK-NEXT:    [[MATRIX_ROWMAJOR2COLMAJOR:%.*]] = shufflevector <6 x float> [[VECINIT10]], <6 x float> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+// CHECK-NEXT:    ret <6 x float> [[MATRIX_ROWMAJOR2COLMAJOR]]
 //
 float3x2 case3(float3 a, float3 b) {
  // vec[0] = A[0]
- // vec[1] = A[2]
- // vec[2] = B[1]
- // vec[3] = A[1]
- // vec[4] = B[0]
+ // vec[1] = A[1]
+ // vec[2] = A[2]
+ // vec[3] = B[0]
+ // vec[4] = B[1]
  // vec[5] = B[2]
   return float3x2(a,b);
 }
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixInitializerListOrder.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixInitializerListOrder.hlsl
@@ -0,0 +1,60 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes \
+// RUN:   -emit-llvm -finclude-default-header -o - %s | FileCheck %s --check-prefix=CHECK,COL-CHECK
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes \
+// RUN:   -emit-llvm -finclude-default-header -fmatrix-memory-layout=row-major -o - %s \
+// RUN:   | FileCheck %s --check-prefix=CHECK,ROW-CHECK
+
+// Verify that matrix initializer lists store elements in the correct memory
+// layout. The initializer list {1,2,3,4,5,6} for a float2x3 (2 rows, 3 cols)
+// is in row-major order: row0=[1,2,3], row1=[4,5,6].
+//
+// With column-major (default) memory layout, the stored vector should be
+// reordered to: col0=[1,4], col1=[2,5], col2=[3,6] = <1,4,2,5,3,6>.
+//
+// With row-major memory layout, the stored vector stays as-is: <1,2,3,4,5,6>.
+
+export float test_row0_col2() {
+// CHECK-LABEL: define {{.*}} float @_Z14test_row0_col2v
+// COL-CHECK: store <6 x float> <float 1.000000e+00, float 4.000000e+00, float 2.000000e+00, float 5.000000e+00, float 3.000000e+00, float 6.000000e+00>
+// COL-CHECK: extractelement <6 x float> %{{.*}}, i32 4
+// ROW-CHECK: store <6 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00>
+// ROW-CHECK: extractelement <6 x float> %{{.*}}, i32 2
+  float2x3 M = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+  // Row 0, Col 2 in row-major is the 3rd element = 3.0
+  return M[0][2];
+}
+
+export float test_row1_col0() {
+// CHECK-LABEL: define {{.*}} float @_Z14test_row1_col0v
+// COL-CHECK: store <6 x float> <float 1.000000e+00, float 4.000000e+00, float 2.000000e+00, float 5.000000e+00, float 3.000000e+00, float 6.000000e+00>
+// COL-CHECK: extractelement <6 x float> %{{.*}}, i32 1
+// ROW-CHECK: store <6 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00>
+// ROW-CHECK: extractelement <6 x float> %{{.*}}, i32 3
+  float2x3 M = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+  // Row 1, Col 0 in row-major is the 4th element = 4.0
+  return M[1][0];
+}
+
+// Verify the shuffle is emitted for non-constant init lists when the memory
+// layout is column-major, and not emitted when it is row-major.
+
+export float2x3 test_dynamic(float a, float b, float c,
+                             float d, float e, float f) {
+// CHECK-LABEL: define {{.*}} <6 x float> @_Z12test_dynamicffffff
+// CHECK: [[A:%.*]] = load float, ptr %a.addr
+// CHECK: [[VECINIT0:%.*]] = insertelement <6 x float> poison, float [[A]], i32 0
+// CHECK: [[B:%.*]] = load float, ptr %b.addr
+// CHECK: [[VECINIT1:%.*]] = insertelement <6 x float> [[VECINIT0]], float [[B]], i32 1
+// CHECK: [[C:%.*]] = load float, ptr %c.addr
+// CHECK: [[VECINIT2:%.*]] = insertelement <6 x float> [[VECINIT1]], float [[C]], i32 2
+// CHECK: [[D:%.*]] = load float, ptr %d.addr
+// CHECK: [[VECINIT3:%.*]] = insertelement <6 x float> [[VECINIT2]], float [[D]], i32 3
+// CHECK: [[E:%.*]] = load float, ptr %e.addr
+// CHECK: [[VECINIT4:%.*]] = insertelement <6 x float> [[VECINIT3]], float [[E]], i32 4
+// CHECK: [[F:%.*]] = load float, ptr %f.addr
+// CHECK: [[VECINIT5:%.*]] = insertelement <6 x float> [[VECINIT4]], float [[F]], i32 5
+// COL-CHECK: shufflevector <6 x float> [[VECINIT5]], <6 x float> poison, <6 x i32> <i32 0, i32 3, i32 1, i32 4, i32 2, i32 5>
+// ROW-CHECK-NOT: shufflevector
+// ROW-CHECK: store <6 x float> [[VECINIT5]], ptr
+  return (float2x3){a, b, c, d, e, f};
+}
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixToAndFromVectorConstructors.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixToAndFromVectorConstructors.hlsl
@@ -40,15 +40,17 @@ float4 fn(float2x2 m) {
 // CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i32> [[TMP0]], i64 0
 // CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VECEXT]], i32 0
 // CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[V_ADDR]], align 16
-// CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x i32> [[TMP1]], i64 2
+// CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x i32> [[TMP1]], i64 1
 // CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VECEXT1]], i32 1
 // CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[V_ADDR]], align 16
-// CHECK-NEXT:    [[VECEXT3:%.*]] = extractelement <4 x i32> [[TMP2]], i64 1
+// CHECK-NEXT:    [[VECEXT3:%.*]] = extractelement <4 x i32> [[TMP2]], i64 2
 // CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <4 x i32> [[VECINIT2]], i32 [[VECEXT3]], i32 2
 // CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[V_ADDR]], align 16
 // CHECK-NEXT:    [[VECEXT5:%.*]] = extractelement <4 x i32> [[TMP3]], i64 3
 // CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT4]], i32 [[VECEXT5]], i32 3
-// CHECK-NEXT:    store <4 x i32> [[VECINIT6]], ptr [[M]], align 4
+// COL-CHECK-NEXT:    [[MATRIX_ROWMAJOR2COLMAJOR:%.*]] = shufflevector <4 x i32> [[VECINIT6]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+// COL-CHECK-NEXT:    store <4 x i32> [[MATRIX_ROWMAJOR2COLMAJOR]], ptr [[M]], align 4
+// ROW-CHECK-NEXT:    store <4 x i32> [[VECINIT6]], ptr [[M]], align 4
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[M]], align 4
 // CHECK-NEXT:    ret <4 x i32> [[TMP4]]
 //
@@ -68,7 +70,9 @@ int2x2 fn(int4 v) {
 // CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[V_ADDR]], align 8
 // CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1
 // CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VECEXT1]], i32 1
-// CHECK-NEXT:    ret <2 x i32> [[VECINIT2]]
+// COL-CHECK-NEXT:    [[MATRIX_ROWMAJOR2COLMAJOR:%.*]] = shufflevector <2 x i32> [[VECINIT2]], <2 x i32> poison, <2 x i32> <i32 0, i32 1>
+// COL-CHECK-NEXT:    ret <2 x i32> [[MATRIX_ROWMAJOR2COLMAJOR]]
+// ROW-CHECK-NEXT:    ret <2 x i32> [[VECINIT2]]
 //
 int1x2 fn1(int2 v) {
     return v;
@@ -92,7 +96,9 @@ int1x2 fn1(int2 v) {
 // CHECK-NEXT:    [[LOADEDV4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i1>
 // CHECK-NEXT:    [[VECEXT5:%.*]] = extractelement <3 x i1> [[LOADEDV4]], i64 2
 // CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <3 x i1> [[VECINIT3]], i1 [[VECEXT5]], i32 2
-// CHECK-NEXT:    ret <3 x i1> [[VECINIT6]]
+// COL-CHECK-NEXT:    [[MATRIX_ROWMAJOR2COLMAJOR:%.*]] = shufflevector <3 x i1> [[VECINIT6]], <3 x i1> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// COL-CHECK-NEXT:    ret <3 x i1> [[MATRIX_ROWMAJOR2COLMAJOR]]
+// ROW-CHECK-NEXT:    ret <3 x i1> [[VECINIT6]]
 //
 bool3x1 fn2(bool3 b) {
     return b;
diff --git a/clang/test/CodeGenHLSL/BoolMatrix.hlsl b/clang/test/CodeGenHLSL/BoolMatrix.hlsl