diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4d0330315a..f99a2dfeaa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,9 @@ with the exception that minor releases may include breaking changes.
 
 ### Added
 
+- ✨ Add a `fuse-two-qubit-unitary-runs` pass
+  for fusing compile-time two-qubit unitary windows via Weyl/KAK resynthesis
+  ([#1655]) ([**@simon1hofmann**])
 - ✨ Add a `fuse-single-qubit-unitary-runs` pass
   for fusing compile-time single-qubit unitary runs via Euler resynthesis
   ([#1672]) ([**@simon1hofmann**], [**@burgholzer**])
@@ -631,6 +634,7 @@ changelogs._
 [#1664]: https://github.com/munich-quantum-toolkit/core/pull/1664
 [#1662]: https://github.com/munich-quantum-toolkit/core/pull/1662
 [#1660]: https://github.com/munich-quantum-toolkit/core/pull/1660
+[#1655]: https://github.com/munich-quantum-toolkit/core/pull/1655
 [#1652]: https://github.com/munich-quantum-toolkit/core/pull/1652
 [#1638]: https://github.com/munich-quantum-toolkit/core/pull/1638
 [#1637]: https://github.com/munich-quantum-toolkit/core/pull/1637
diff --git a/mlir/include/mlir/Compiler/CompilerPipeline.h b/mlir/include/mlir/Compiler/CompilerPipeline.h
index 54d9f039de..1844a75dd7 100644
--- a/mlir/include/mlir/Compiler/CompilerPipeline.h
+++ b/mlir/include/mlir/Compiler/CompilerPipeline.h
@@ -49,6 +49,18 @@ struct QuantumCompilerConfig {
 
   /// Enable Hadamard lifting
   bool enableHadamardLifting = false;
+
+  /// Comma-separated native gate menu. Recognised tokens: `u`, `x`, `sx`,
+  /// `rz` (or `p`), `rx`, `ry`, `r`, `cx`, `cz`, `rzz`.
+  /// Illustrative menus (use `cx` or `cz` as the entangler, or
+  /// both):
+  /// - `"x,sx,rz,cx"` / `"x,sx,rz,cz"` — IBM basic (no fractional 2q)
+  /// - `"x,sx,rz,rx,rzz,cx"` / `"...,cz"` — IBM fractional
+  /// - `"u,cx"` / `"u,cz"` — generic single-qubit U3 + CX/CZ
+  /// - `"r,cz"` — IQM-style default
+  /// - `"rx,rz,cx"`, `"rx,ry,cz"`, `"ry,rz,cx"` — supported RX/RY/RZ pairs plus
+  /// entangler
+  std::string nativeGates;
 };
 
 /**
@@ -84,7 +96,7 @@ struct CompilationRecord {
  * 2. QC cleanup pipeline
  * 3. QCO dialect (value semantics) - enables SSA-based optimizations
  * 4. QCO cleanup pipeline
- * 5. Quantum optimization passes
+ * 5. Optimization and native gate synthesis
  * 6. QCO cleanup pipeline
  * 7. QC dialect - converted back for backend lowering
  * 8. QC cleanup pipeline
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/BasisDecomposer.h b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/BasisDecomposer.h
new file mode 100644
index 0000000000..a2cc1eb753
--- /dev/null
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/BasisDecomposer.h
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#pragma once
+
+#include "UnitaryMatrices.h"
+#include "WeylDecomposition.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/ADT/SmallVector.h>
+
+#include <array>
+#include <complex>
+#include <cstdint>
+#include <optional>
+#include <utility>
+
+namespace mlir::qco::decomposition {
+
+/// Intermediate single-qubit ``2×2`` unitaries produced while expanding a
+/// two-qubit basis decomposition.
+using TwoQubitLocalUnitaryList = llvm::SmallVector<Matrix2x2, 8>;
+
+/**
+ * Result of a two-qubit basis decomposition expressed as raw single-qubit
+ * factors interleaved with a fixed number of basis-gate (entangler) uses.
+ *
+ * The factors are stored in emission order. For `i` in `[0, numBasisUses)` the
+ * pair `(singleQubitFactors[2*i], singleQubitFactors[2*i + 1])` is applied to
+ * qubits `1` and `0` respectively, followed by one entangler. The final pair
+ * `(singleQubitFactors[2*numBasisUses], singleQubitFactors[2*numBasisUses+1])`
+ * is applied after the last entangler. The list therefore has length
+ * `2 * (numBasisUses + 1)`.
+ */
+struct TwoQubitNativeDecomposition {
+  /// Number of basis-gate (entangler) uses (0 to 3 per decomposition).
+  std::uint8_t numBasisUses = 0;
+  /// Single-qubit factors in emission order; `2 * (numBasisUses + 1)` entries.
+  TwoQubitLocalUnitaryList singleQubitFactors;
+  /// Residual global phase (radians) not represented by factors/entanglers.
+  double globalPhase = 0.0;
+};
+
+/**
+ * Decomposer that must be initialized with a two-qubit basis gate that will
+ * be used to generate a circuit equivalent to a canonical gate (RXX+RYY+RZZ).
+ *
+ * @note Adapted from TwoQubitBasisDecomposer in the IBM Qiskit framework.
+ *       (C) Copyright IBM 2023
+ *
+ *       This code is licensed under the Apache License, Version 2.0. You may
+ *       obtain a copy of this license in the LICENSE.txt file in the root
+ *       directory of this source tree or at
+ *       https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ *       Any modifications or derivative works of this code must retain this
+ *       copyright notice, and modified files need to carry a notice
+ *       indicating that they have been altered from the originals.
+ */
+class TwoQubitBasisDecomposer {
+public:
+  /**
+   * Create decomposer that allows two-qubit decompositions based on the
+   * specified entangler matrix.
+   * This entangler will appear between 0 and 3 times in each decomposition.
+   * The 4x4 matrix must be in MQT operand order (qubit 0 = MSB).
+   */
+  [[nodiscard]] static TwoQubitBasisDecomposer
+  create(const Matrix4x4& basisMatrix, double basisFidelity);
+
+  /**
+   * Perform decomposition using the basis gate of this decomposer.
+   *
+   * @param targetDecomposition Prepared Weyl decomposition of unitary matrix
+   *                            to be decomposed.
+   * @param numBasisGateUses Force use of given number of basis gates. When
+   *                         unset, the optimal count is selected from the
+   *                         Hilbert-Schmidt traces.
+   * @return The single-qubit factors and entangler count, or `std::nullopt`
+   *         when more than one basis gate would be required but the basis gate
+   *         is not super-controlled.
+   */
+  [[nodiscard]] std::optional<TwoQubitNativeDecomposition> twoQubitDecompose(
+      const decomposition::TwoQubitWeylDecomposition& targetDecomposition,
+      std::optional<std::uint8_t> numBasisGateUses) const;
+
+protected:
+  // NOLINTBEGIN(modernize-pass-by-value)
+  /**
+   * Member-wise constructor: stores the given basis data and local factors.
+   */
+  TwoQubitBasisDecomposer(
+      double basisFidelity,
+      const decomposition::TwoQubitWeylDecomposition& basisDecomposer,
+      bool isSuperControlled, const Matrix2x2& u0l, const Matrix2x2& u0r,
+      const Matrix2x2& u1l, const Matrix2x2& u1ra, const Matrix2x2& u1rb,
+      const Matrix2x2& u2la, const Matrix2x2& u2lb, const Matrix2x2& u2ra,
+      const Matrix2x2& u2rb, const Matrix2x2& u3l, const Matrix2x2& u3r,
+      const Matrix2x2& q0l, const Matrix2x2& q0r, const Matrix2x2& q1la,
+      const Matrix2x2& q1lb, const Matrix2x2& q1ra, const Matrix2x2& q1rb,
+      const Matrix2x2& q2l, const Matrix2x2& q2r)
+      : basisFidelity{basisFidelity}, basisDecomposer{basisDecomposer},
+        isSuperControlled{isSuperControlled}, u0l{u0l}, u0r{u0r}, u1l{u1l},
+        u1ra{u1ra}, u1rb{u1rb}, u2la{u2la}, u2lb{u2lb}, u2ra{u2ra}, u2rb{u2rb},
+        u3l{u3l}, u3r{u3r}, q0l{q0l}, q0r{q0r}, q1la{q1la}, q1lb{q1lb},
+        q1ra{q1ra}, q1rb{q1rb}, q2l{q2l}, q2r{q2r} {}
+  // NOLINTEND(modernize-pass-by-value)
+
+  /**
+   * Calculate decompositions when no basis gate is required.
+   *
+   * Decompose target :math:`\sim U_d(x, y, z)` with 0 uses of the
+   * basis gate. Result :math:`U_r` has trace:
+   *
+   * .. math::
+   *
+   *     \Big\vert\text{Tr}(U_r\cdot U_\text{target}^{\dag})\Big\vert =
+   *     4\Big\vert \cos(x)\cos(y)\cos(z) + j \sin(x)\sin(y)\sin(z)\Big\vert
+   *
+   * which is optimal for all targets and bases.
+   */
+  [[nodiscard]] static TwoQubitLocalUnitaryList
+  decomp0(const decomposition::TwoQubitWeylDecomposition& target);
+
+  /**
+   * Calculate decompositions when one basis gate is required.
+   *
+   * Decompose target :math:`\sim U_d(x, y, z)` with 1 use of the
+   * basis gate :math:`\sim U_d(a, b, c)`. Result :math:`U_r` has trace:
+   *
+   * .. math::
+   *
+   *     \Big\vert\text{Tr}(U_r \cdot U_\text{target}^{\dag})\Big\vert =
+   *     4\Big\vert \cos(x-a)\cos(y-b)\cos(z-c) + j
+   *     \sin(x-a)\sin(y-b)\sin(z-c)\Big\vert
+   *
+   * which is optimal for all targets and bases with ``z==0`` or ``c==0``.
+   */
+  [[nodiscard]] TwoQubitLocalUnitaryList
+  decomp1(const decomposition::TwoQubitWeylDecomposition& target) const;
+
+  /**
+   * Calculate decompositions when two basis gates are required.
+   *
+   * Decompose target :math:`\sim U_d(x, y, z)` with 2 uses of the
+   * basis gate.
+   *
+   * For supercontrolled basis :math:`\sim U_d(\pi/4, b, 0)`, all b, result
+   * :math:`U_r` has trace
+   *
+   * .. math::
+   *
+   *     \Big\vert\text{Tr}(U_r \cdot U_\text{target}^\dag) \Big\vert =
+   *     4\cos(z)
+   *
+   * which is the optimal approximation for basis of CNOT-class
+   * :math:`\sim U_d(\pi/4, 0, 0)` or DCNOT-class
+   * :math:`\sim U_d(\pi/4, \pi/4, 0)` and any target. It may be sub-optimal
+   * for :math:`b \neq 0` (i.e. there exists an exact decomposition for any
+   * target using :math:`B \sim U_d(\pi/4, \pi/8, 0)`, but it may not be this
+   * decomposition). This is an exact decomposition for supercontrolled basis
+   * and target :math:`\sim U_d(x, y, 0)`. No guarantees for
+   * non-supercontrolled basis.
+   */
+  [[nodiscard]] TwoQubitLocalUnitaryList decomp2Supercontrolled(
+      const decomposition::TwoQubitWeylDecomposition& target) const;
+
+  /**
+   * Calculate decompositions when three basis gates are required.
+   *
+   * Decompose target with 3 uses of the basis.
+   *
+   * This is an exact decomposition for supercontrolled basis
+   * :math:`\sim U_d(\pi/4, b, 0)`, all b, and any target. No guarantees for
+   * non-supercontrolled basis.
+   */
+  [[nodiscard]] TwoQubitLocalUnitaryList decomp3Supercontrolled(
+      const decomposition::TwoQubitWeylDecomposition& target) const;
+
+  /**
+   * Calculate traces for a combination of the parameters of the canonical
+   * gates of the target and basis decompositions.
+   * This can be used to determine the smallest number of basis gates that are
+   * necessary to construct an equivalent to the canonical gate.
+   */
+  [[nodiscard]] std::array<std::complex<double>, 4>
+  traces(const decomposition::TwoQubitWeylDecomposition& target) const;
+
+  [[nodiscard]] static bool relativeEq(double lhs, double rhs, double epsilon,
+                                       double maxRelative);
+
+private:
+  // fidelity with which the basis gate decomposition has been calculated
+  double basisFidelity;
+  // cached Weyl decomposition of the basis gate
+  decomposition::TwoQubitWeylDecomposition basisDecomposer;
+  // true if basis gate is super-controlled
+  bool isSuperControlled;
+
+  // pre-built components for decomposition with 3 basis gates
+  Matrix2x2 u0l;
+  Matrix2x2 u0r;
+  Matrix2x2 u1l;
+  Matrix2x2 u1ra;
+  Matrix2x2 u1rb;
+  Matrix2x2 u2la;
+  Matrix2x2 u2lb;
+  Matrix2x2 u2ra;
+  Matrix2x2 u2rb;
+  Matrix2x2 u3l;
+  Matrix2x2 u3r;
+
+  // pre-built components for decomposition with 2 basis gates
+  Matrix2x2 q0l;
+  Matrix2x2 q0r;
+  Matrix2x2 q1la;
+  Matrix2x2 q1lb;
+  Matrix2x2 q1ra;
+  Matrix2x2 q1rb;
+  Matrix2x2 q2l;
+  Matrix2x2 q2r;
+};
+
+} // namespace mlir::qco::decomposition
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/Euler.h b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/Euler.h
index 8fb0018b28..5d02898c3b 100644
--- a/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/Euler.h
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/Euler.h
@@ -32,6 +32,7 @@ enum class EulerBasis : std::uint8_t {
   XYX = 3,  ///< `RX(phi) * RY(theta) * RX(lambda)`.
   U = 4,    ///< `U(theta, phi, lambda)`.
   ZSXX = 5, ///< `RZ` / `SX` / `X` synthesis via ZYZ decomposition.
+  R = 6,    ///< `R(.,0) * R(.,pi/2) * R(.,0)` (XYX with `Rx`/`Ry` as `R`).
 };
 
 /**
@@ -42,6 +43,31 @@ enum class EulerBasis : std::uint8_t {
  */
 [[nodiscard]] std::optional<EulerBasis> parseEulerBasis(StringRef basis);
 
+/**
+ * @brief Euler angles `(theta, phi, lambda)` and global phase for a 2x2
+ * unitary.
+ *
+ * The decomposition obeys
+ * `matrix == e^{i*phase} * K(phi) * A(theta) * K(lambda)`
+ * where `(K, A)` are the rotation axes of the chosen @ref EulerBasis.
+ */
+struct EulerAngles {
+  double theta = 0.0;  ///< Angle of the middle rotation `A(theta)`.
+  double phi = 0.0;    ///< Angle of the left outer rotation `K(phi)`.
+  double lambda = 0.0; ///< Angle of the right outer rotation `K(lambda)`.
+  double phase = 0.0;  ///< Global phase in radians.
+};
+
+/**
+ * @brief Extracts `(theta, phi, lambda, phase)` of @p matrix in @p basis.
+ *
+ * @param matrix The single-qubit unitary to decompose.
+ * @param basis The target Euler basis.
+ * @return The extracted Euler angles and global phase.
+ */
+[[nodiscard]] EulerAngles anglesFromUnitary(const Matrix2x2& matrix,
+                                            EulerBasis basis);
+
 /**
  * @brief Synthesizes a composed single-qubit unitary as gates in @p basis.
  *
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/Helpers.h b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/Helpers.h
new file mode 100644
index 0000000000..4d45854321
--- /dev/null
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/Helpers.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#pragma once
+
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <complex>
+
+/// Numeric helpers used by the decomposition passes.
+
+namespace mlir::qco::helpers {
+
+/// Check whether `matrix` is unitary within `tolerance` (i.e. `M^H M` is
+/// approximately the identity).
+[[nodiscard]] bool isUnitaryMatrix(const Matrix2x2& matrix,
+                                   double tolerance = 1e-12);
+
+/**
+ * Euclidean remainder of a modulo b.
+ * The returned value is never negative.
+ */
+[[nodiscard]] double remEuclid(double a, double b);
+
+/**
+ * Convert a two-qubit trace overlap into the average gate fidelity metric used
+ * by the decomposition cost code.
+ */
+[[nodiscard]] double traceToFidelity(const std::complex<double>& x);
+
+/**
+ * Return the scalar `e^(i * globalPhase)` factor for a stored global phase.
+ */
+[[nodiscard]] std::complex<double> globalPhaseFactor(double globalPhase);
+
+} // namespace mlir::qco::helpers
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/UnitaryMatrices.h b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/UnitaryMatrices.h
new file mode 100644
index 0000000000..cb6fd56389
--- /dev/null
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/UnitaryMatrices.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#pragma once
+
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/ADT/SmallVector.h>
+
+#include <cstddef>
+
+/// Standard-basis matrix factories for the decomposition layer. Two-qubit
+/// matrices use the same computational-basis index bit order as
+/// ``UnitaryOpInterface::getUnitaryMatrix4x4`` (qubit 0 labels the high bit).
+
+namespace mlir::qco::decomposition {
+
+/// Logical qubit index used by ``expandToTwoQubits`` /
+/// ``fixTwoQubitMatrixQubitOrder``.
+using QubitId = std::size_t;
+
+inline constexpr double FRAC1_SQRT2 =
+    0.707106781186547524400844362104849039284835937688474036588L;
+
+/// Generic 3-parameter single-qubit unitary `U(theta, phi, lambda)`.
+[[nodiscard]] Matrix2x2 uMatrix(double theta, double phi, double lambda);
+/// Axis rotations `exp(-i theta/2 * sigma_{x,y,z})`.
+[[nodiscard]] Matrix2x2 rxMatrix(double theta);
+[[nodiscard]] Matrix2x2 ryMatrix(double theta);
+[[nodiscard]] Matrix2x2 rzMatrix(double theta);
+/// Two-qubit Ising-style rotations on the `XX`, `YY`, `ZZ` generators.
+[[nodiscard]] Matrix4x4 rxxMatrix(double theta);
+[[nodiscard]] Matrix4x4 ryyMatrix(double theta);
+[[nodiscard]] Matrix4x4 rzzMatrix(double theta);
+/// Phase gate `diag(1, exp(i lambda))`.
+[[nodiscard]] Matrix2x2 pMatrix(double lambda);
+
+/// `SWAP` gate (4x4).
+[[nodiscard]] const Matrix4x4& swapGate();
+/// Hadamard gate (2x2).
+[[nodiscard]] const Matrix2x2& hGate();
+/// `i * sigma_z`; useful when factoring Pauli rotations out of a 2x2.
+[[nodiscard]] const Matrix2x2& ipz();
+/// `i * sigma_y`.
+[[nodiscard]] const Matrix2x2& ipy();
+/// `i * sigma_x`.
+[[nodiscard]] const Matrix2x2& ipx();
+
+/// CX entangler with control on qubit 0 (MSB) and target on qubit 1.
+[[nodiscard]] const Matrix4x4& cxGate01();
+/// CX entangler with control on qubit 1 and target on qubit 0 (MSB).
+[[nodiscard]] const Matrix4x4& cxGate10();
+/// CZ entangler (wire-order invariant).
+[[nodiscard]] const Matrix4x4& czGate();
+
+/// Kronecker-embed a 2x2 on wire ``qubitId`` (identity on the other wire).
+[[nodiscard]] Matrix4x4 expandToTwoQubits(const Matrix2x2& singleQubitMatrix,
+                                          QubitId qubitId);
+
+/// Reorder a 4x4 two-qubit matrix so its qubits match the canonical
+/// `(low, high)` order given the operand-order `qubitIds`. No-op when the
+/// operand order already matches.
+[[nodiscard]] Matrix4x4
+fixTwoQubitMatrixQubitOrder(const Matrix4x4& twoQubitMatrix,
+                            const llvm::SmallVector<QubitId, 2>& qubitIds);
+
+} // namespace mlir::qco::decomposition
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/WeylDecomposition.h b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/WeylDecomposition.h
new file mode 100644
index 0000000000..7f9bda7a30
--- /dev/null
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/Decomposition/WeylDecomposition.h
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#pragma once
+
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <array>
+#include <complex>
+#include <cstdint>
+#include <optional>
+#include <tuple>
+#include <utility>
+
+namespace mlir::qco::decomposition {
+/**
+ * Allowed deviation for internal asserts that check decomposition correctness.
+ * `inline` gives all including translation units one shared definition.
+ */
+inline constexpr double SANITY_CHECK_PRECISION = 1e-12;
+
+/**
+ * Weyl decomposition of a 2-qubit unitary matrix (4x4).
+ * The result consists of four 2x2 1-qubit matrices (k1l, k2l, k1r, k2r) and
+ * three parameters (a, b, c) of a canonical gate. The 1-qubit matrices can be
+ * decomposed further (e.g. into rotation gates) by a single-qubit synthesis;
+ * the canonical gate corresponds to RXX(-2 * a), RYY(-2 * b), RZZ(-2 * c).
+ *
+ * @note Adapted from TwoQubitWeylDecomposition in the IBM Qiskit framework.
+ *       (C) Copyright IBM 2023
+ *
+ *       This code is licensed under the Apache License, Version 2.0. You may
+ *       obtain a copy of this license in the LICENSE.txt file in the root
+ *       directory of this source tree or at
+ *       https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ *       Any modifications or derivative works of this code must retain this
+ *       copyright notice, and modified files need to carry a notice
+ *       indicating that they have been altered from the originals.
+ */
+class TwoQubitWeylDecomposition {
+public:
+  /**
+   * Create Weyl decomposition.
+   *
+   * @param unitaryMatrix Matrix of the two-qubit operation/series to be
+   *                      decomposed.
+   * @param fidelity Tolerance to assume a specialization which is used to
+   *                 reduce the number of parameters required by the canonical
+   *                 gate and thus potentially decreasing the number of basis
+   *                 gates.
+   */
+  [[nodiscard]] static TwoQubitWeylDecomposition
+  create(const Matrix4x4& unitaryMatrix, std::optional<double> fidelity);
+
+  ~TwoQubitWeylDecomposition() = default;
+  TwoQubitWeylDecomposition(const TwoQubitWeylDecomposition&) = default;
+  TwoQubitWeylDecomposition(TwoQubitWeylDecomposition&&) = default;
+  TwoQubitWeylDecomposition&
+  operator=(const TwoQubitWeylDecomposition&) = default;
+  TwoQubitWeylDecomposition& operator=(TwoQubitWeylDecomposition&&) = default;
+
+  /**
+   * Calculate matrix of canonical gate based on its parameters a, b, c.
+   */
+  [[nodiscard]] Matrix4x4 getCanonicalMatrix() const {
+    return getCanonicalMatrix(a_, b_, c_);
+  }
+
+  /**
+   * First parameter of canonical gate.
+   *
+   * @note must be multiplied by -2.0 for rotation angle of RXX gate
+   */
+  [[nodiscard]] double a() const { return a_; }
+  /**
+   * Second parameter of canonical gate.
+   *
+   * @note must be multiplied by -2.0 for rotation angle of RYY gate
+   */
+  [[nodiscard]] double b() const { return b_; }
+  /**
+   * Third parameter of canonical gate.
+   *
+   * @note must be multiplied by -2.0 for rotation angle of RZZ gate
+   */
+  [[nodiscard]] double c() const { return c_; }
+  /**
+   * Necessary global phase adjustment after applying decomposition.
+   */
+  [[nodiscard]] double globalPhase() const { return globalPhase_; }
+
+  /**
+   * "Left" qubit after canonical gate.
+   *
+   * q1 - k2r - C -  k1r  -
+   *            A
+   * q0 - k2l - N - *k1l* -
+   */
+  [[nodiscard]] const Matrix2x2& k1l() const { return k1l_; }
+  /**
+   * "Left" qubit before canonical gate.
+   *
+   * q1 -  k2r  - C - k1r -
+   *              A
+   * q0 - *k2l* - N - k1l -
+   */
+  [[nodiscard]] const Matrix2x2& k2l() const { return k2l_; }
+  /**
+   * "Right" qubit after canonical gate.
+   *
+   * q1 - k2r - C - *k1r* -
+   *            A
+   * q0 - k2l - N -  k1l  -
+   */
+  [[nodiscard]] const Matrix2x2& k1r() const { return k1r_; }
+  /**
+   * "Right" qubit before canonical gate.
+   *
+   * q1 - *k2r* - C - k1r -
+   *              A
+   * q0 -  k2l  - N - k1l -
+   */
+  [[nodiscard]] const Matrix2x2& k2r() const { return k2r_; }
+
+  /**
+   * Calculate matrix of canonical gate based on given parameters a, b, c.
+   */
+  [[nodiscard]] static Matrix4x4 getCanonicalMatrix(double a, double b,
+                                                    double c);
+
+protected:
+  enum class Specialization : std::uint8_t {
+    General,               // canonical gate has no special symmetry.
+    IdEquiv,               // canonical gate is identity.
+    SWAPEquiv,             // canonical gate is SWAP.
+    PartialSWAPEquiv,      // canonical gate is partial SWAP.
+    PartialSWAPFlipEquiv,  // canonical gate is flipped partial SWAP.
+    ControlledEquiv,       // canonical gate is a controlled gate.
+    MirrorControlledEquiv, // canonical gate is swap + controlled gate.
+
+    // These next 3 gates use the definition of fSim from eq (1) in:
+    // https://arxiv.org/pdf/2001.08343.pdf
+    FSimaabEquiv,  // parameters a=b & a!=c
+    FSimabbEquiv,  // parameters a!=b & b=c
+    FSimabmbEquiv, // parameters a!=b!=c & -b=c
+  };
+
+  enum class MagicBasisTransform : std::uint8_t {
+    Into,
+    OutOf,
+  };
+
+  /**
+   * Threshold for imprecision in computation of diagonalization.
+   */
+  static constexpr auto DIAGONALIZATION_PRECISION = 1e-13;
+
+  TwoQubitWeylDecomposition() = default;
+
+  [[nodiscard]] static Matrix4x4
+  magicBasisTransform(const Matrix4x4& unitary, MagicBasisTransform direction);
+
+  [[nodiscard]] static double closestPartialSwap(double a, double b, double c);
+
+  /**
+   * Diagonalize given complex symmetric matrix M into (P, d) using a
+   * randomized algorithm.
+   * This approach is used in both qiskit and quantumflow.
+   *
+   * P is the real orthogonal matrix of eigenvectors of M, with P in SO(4).
+   * d is a vector containing sqrt(eigenvalues) of M with unit-magnitude
+   * elements (for each element, complex magnitude is 1.0).
+   * D is d as a diagonal matrix.
+   *
+   * M = P * D * P^T
+   *
+   * @return pair of (P, D.diagonal())
+   */
+  [[nodiscard]] static std::pair<Matrix4x4, std::array<Complex, 4>>
+  diagonalizeComplexSymmetric(const Matrix4x4& m, double precision);
+
+  /**
+   * Decompose a special unitary matrix C that is the Kronecker product of two
+   * single-qubit gates A and B into its single-qubit matrices.
+   *
+   * C = A ⊗ B
+   *
+   * @param specialUnitary Special unitary matrix C
+   *
+   * @return single-qubit matrices A and B and the required
+   *         global phase adjustment
+   */
+  [[nodiscard]] static std::tuple<Matrix2x2, Matrix2x2, double>
+  decomposeTwoQubitProductGate(const Matrix4x4& specialUnitary);
+
+  /**
+   * Calculate trace of two sets of parameters for the canonical gate.
+   * The trace has been defined in: https://arxiv.org/abs/1811.12926
+   */
+  [[nodiscard]] static std::complex<double>
+  getTrace(double a, double b, double c, double ap, double bp, double cp);
+
+  /**
+   * Choose the best specialization for the canonical gate.
+   * This will use the requestedFidelity to determine if a specialization is
+   * close enough to the actual canonical gate matrix.
+   */
+  [[nodiscard]] Specialization bestSpecialization() const;
+
+  /**
+   * @return true if the specialization flipped the original decomposition
+   */
+  bool applySpecialization();
+
+private:
+  // Canonical gate parameters `(a, b, c)`; documented on the public accessors.
+  double a_{};
+  double b_{};
+  double c_{};
+  double globalPhase_{};
+  // Single-qubit factors surrounding the canonical gate; see the accessors
+  // for the per-field wiring diagram.
+  Matrix2x2 k1l_;
+  Matrix2x2 k2l_;
+  Matrix2x2 k1r_;
+  Matrix2x2 k2r_;
+  Specialization specialization{Specialization::General};
+  /// Optional `traceToFidelity` floor for specialization; unset disables it.
+  std::optional<double> requestedFidelity;
+};
+} // namespace mlir::qco::decomposition
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/FuseTwoQubitUnitaryRuns.h b/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/FuseTwoQubitUnitaryRuns.h
new file mode 100644
index 0000000000..a09f980139
--- /dev/null
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/FuseTwoQubitUnitaryRuns.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+/// \file
+/// Fuse maximal two-qubit unitary windows (with absorbed single-qubit padding).
+
+#pragma once
+
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Types.h"
+
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/PatternMatch.h>
+#include <mlir/Support/LogicalResult.h>
+
+namespace mlir::qco::native_synth {
+
+/// Scan `root` for maximal two-qubit windows (including absorbed single-qubit
+/// ops on the same wire pair) and replace each window when Weyl/KAK
+/// resynthesis to the native profile is profitable.
+LogicalResult fuseTwoQubitUnitaryRuns(IRRewriter& rewriter, Operation* root,
+                                      const NativeProfileSpec& spec);
+
+} // namespace mlir::qco::native_synth
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/NativeSpec.h b/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/NativeSpec.h
new file mode 100644
index 0000000000..4993a85758
--- /dev/null
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/NativeSpec.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#pragma once
+
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Euler.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Types.h"
+
+#include <llvm/ADT/StringRef.h>
+
+#include <optional>
+
+namespace mlir::qco::native_synth {
+
+/// Euler basis used to synthesize an arbitrary single-qubit unitary into the
+/// gates emitted by `emitter`. This is the deterministic replacement for the
+/// scored multi-basis search.
+[[nodiscard]] decomposition::EulerBasis
+emitterEulerBasis(const SingleQubitEmitterSpec& emitter);
+
+/// Resolve a comma-separated native gate menu (e.g. `"x,sx,rz,cx"`), as given
+/// by the pass option `native-gates`, into a full `NativeProfileSpec`
+/// (single-qubit emitters, entangler bases, and `allowedGates`).
+///
+/// Recognised tokens (kept in sync with `Passes.td` for this pass): `u`, `x`,
+/// `sx`, `rz` (or `p`), `rx`, `ry`, `r`, `cx`, `cz`, `rzz`.
+///
+/// @return The resolved spec, or an empty optional (NOTE(review): presumably
+///         for an unrecognised or unsupported menu; confirm in the `.cpp`).
+[[nodiscard]] std::optional<NativeProfileSpec>
+resolveNativeGatesSpec(llvm::StringRef nativeGates);
+
+} // namespace mlir::qco::native_synth
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/Policy.h b/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/Policy.h
new file mode 100644
index 0000000000..7b7b00fb9b
--- /dev/null
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/Policy.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#pragma once
+
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Types.h"
+
+#include <mlir/IR/Operation.h>
+
+/// Menu membership checks for native synthesis (no IR rewrites).
+
+namespace mlir::qco::native_synth {
+
+/// Whether the menu contains the corresponding two-qubit entangler. Used by
+/// the 2q rewrite path to pick between CX and CZ emission.
+bool usesCxEntangler(const NativeProfileSpec& spec);
+bool usesCzEntangler(const NativeProfileSpec& spec);
+
+/// Whether an already-lowered single-qubit op is in the menu (i.e. no
+/// further rewrite needed).
+bool allowsSingleQubitOp(UnitaryOpInterface op, const NativeProfileSpec& spec);
+
+/// Whether `op` has a direct (non-matrix) lowering via the corresponding
+/// `decomposeTo*` helper in `SingleQubit.h`. These are used for ops whose
+/// angles are not compile-time constants, so no constant ``2×2`` matrix is
+/// available for the matrix-driven path.
+bool canDirectlyDecomposeToZSXX(Operation* op, bool supportsDirectRx);
+bool canDirectlyDecomposeToU3(Operation* op);
+bool canDirectlyDecomposeToR(Operation* op);
+bool canDirectlyDecomposeToAxisPair(Operation* op, AxisPair axisPair);
+
+} // namespace mlir::qco::native_synth
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/SingleQubit.h b/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/SingleQubit.h
new file mode 100644
index 0000000000..b09c48b4dd
--- /dev/null
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/SingleQubit.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+/// \file
+/// Single-qubit native-synthesis lowering helpers.
+/// Covers symbolic `decomposeTo*` rewrites (used for dynamic-angle ops) plus
+/// the matrix-driven `emitSingleQubitMatrix` synthesizer that lowers any
+/// constant ``2×2`` unitary via the shared `Euler.h` synthesis.
+
+#pragma once
+
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Euler.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Types.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/PatternMatch.h>
+
+namespace mlir::qco::native_synth {
+
+/// Direct (non-matrix) single-qubit lowering to the `ZSXX` emitter
+/// (`{Rz, Sx, X}`). Returns the output qubit value, or a null `Value` if no
+/// direct rule applies and a matrix-based fallback must be tried.
+///
+/// When `supportsDirectRx` is true, the emitter also passes `Rx` through
+/// unchanged and lowers `Ry` / `R` via an `rz * rx * rz` sandwich.
+Value decomposeToZSXX(IRRewriter& rewriter, Operation* op, Value inQubit,
+                      bool supportsDirectRx);
+
+/// Direct (non-matrix) single-qubit lowering to a `U(theta, phi, lambda)`
+/// output. Returns the output qubit value, or a null `Value` if no direct
+/// rule applies and a matrix-based fallback must be tried.
+Value decomposeToU3(IRRewriter& rewriter, Operation* op, Value inQubit);
+
+/// Direct (non-matrix) single-qubit lowering to the `R(theta, phi)` emitter.
+/// Returns the output qubit value, or a null `Value` if no direct rule
+/// applies and a matrix-based fallback must be tried.
+Value decomposeToR(IRRewriter& rewriter, Operation* op, Value inQubit);
+
+/// Direct (non-matrix) single-qubit lowering to a two-axis emitter
+/// identified by `axisPair` (e.g. `{Rx, Rz}`, `{Ry, Rz}`). Returns the
+/// output qubit value, or a null `Value` if no direct rule applies and a
+/// matrix-based fallback must be tried.
+Value decomposeToAxisPair(IRRewriter& rewriter, Operation* op, Value inQubit,
+                          AxisPair axisPair);
+
+/// Synthesize a constant ``2×2`` unitary `matrix` into native gates of `basis`
+/// (including a `qco.gphase` when the residual phase is non-trivial) and
+/// return the resulting output qubit. Wraps `decomposition::Euler`.
+Value emitSingleQubitMatrix(IRRewriter& rewriter, Location loc, Value inQubit,
+                            const Matrix2x2& matrix,
+                            decomposition::EulerBasis basis);
+
+} // namespace mlir::qco::native_synth
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/TwoQubit.h b/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/TwoQubit.h
new file mode 100644
index 0000000000..cd90b21861
--- /dev/null
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/TwoQubit.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#pragma once
+
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Types.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/PatternMatch.h>
+#include <mlir/IR/Value.h>
+#include <mlir/Support/LogicalResult.h>
+
+#include <cstdint>
+#include <optional>
+
+/// Deterministic two-qubit lowering: Weyl decomposition + the
+/// `TwoQubitBasisDecomposer` with a fixed entangler (CX before CZ) and the
+/// first emitter's Euler basis for the surrounding single-qubit factors.
+
+namespace mlir::qco::native_synth {
+
+/// Number of entanglers (basis-gate uses) the minimal KAK decomposition of
+/// `target` requires for the entangler selected by `spec` (CX before CZ).
+/// Returns `std::nullopt` when `spec` has no usable entangler basis.
+std::optional<std::uint8_t>
+twoQubitEntanglerCount(const Matrix4x4& target, const NativeProfileSpec& spec);
+
+/// Synthesize the two-qubit unitary `target` (raw `4×4`, any global phase) at
+/// `(qubit0, qubit1)` into native entanglers and single-qubit gates of `spec`.
+/// The entangler is chosen deterministically (CX before CZ) and the
+/// single-qubit factors use the first emitter's Euler basis. Writes the output
+/// qubit values to `outQubit0` / `outQubit1`.
+///
+/// Returns `failure()` when the profile has no usable entangler basis or the
+/// KAK decomposition is not realizable with that entangler.
+LogicalResult emitTwoQubitNative(IRRewriter& rewriter, Location loc,
+                                 Value qubit0, Value qubit1,
+                                 const Matrix4x4& target,
+                                 const NativeProfileSpec& spec,
+                                 Value& outQubit0, Value& outQubit1);
+
+/// Rewrite `XXPlusYY` / `XXMinusYY` via two `RZZ` blocks (menus with `rzz`).
+LogicalResult rewriteXXPlusMinusYYViaRzz(IRRewriter& rewriter, Operation* op);
+
+} // namespace mlir::qco::native_synth
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/Types.h b/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/Types.h
new file mode 100644
index 0000000000..cd32596bb9
--- /dev/null
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/Types.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#pragma once
+
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/SmallVector.h>
+
+#include <cstdint>
+
+/// Types for native gate synthesis: the resolved menu and its emitters.
+
+namespace mlir::qco::native_synth {
+
+/// Two-axis token pairs (`rx`+`rz`, `rx`+`ry`, `ry`+`rz`) that can be selected
+/// as the single-qubit menu in a `NativeProfileSpec`.
+enum class AxisPair : std::uint8_t { RxRz, RxRy, RyRz };
+
+/// Single-qubit emission strategy.
+enum class SingleQubitMode : std::uint8_t {
+  /// Emit `{X, Sx, Rz}` via the ZSXX Euler decomposition. When the spec's
+  /// `supportsDirectRx` is set, the emitter additionally passes Rx through
+  /// unchanged and expands Ry / R via an `rz * rx * rz` sandwich.
+  ZSXX,
+  /// Emit a single `u(theta, phi, lambda)` op.
+  U3,
+  /// Emit `R(theta, phi)` via the XYX Euler decomposition.
+  R,
+  /// Emit one of the three two-axis rotation pairs selected by `axisPair`.
+  AxisPair,
+};
+
+/// Two-qubit entangling basis selected by a profile.
+enum class EntanglerBasis : std::uint8_t { None, Cx, Cz };
+
+/// Profile-level classification of a native gate. Used both to describe the
+/// menu (`NativeProfileSpec::allowedGates`) and to classify already-lowered
+/// output ops in policy checks.
+enum class NativeGateKind : std::uint8_t {
+  U,
+  X,
+  Sx,
+  Rz,
+  Rx,
+  Ry,
+  R,
+  Cx,
+  Cz,
+  Rzz,
+};
+
+/// Single-qubit emitter specification: the target mode plus any modifiers
+/// (axis pair, whether direct Rx emission is permitted).
+struct SingleQubitEmitterSpec {
+  SingleQubitMode mode = SingleQubitMode::U3;
+  AxisPair axisPair = AxisPair::RxRz;
+  /// Only meaningful for `SingleQubitMode::ZSXX`: when set, the emitter may
+  /// emit Rx / Ry / R directly (via an `rz * rx * rz` sandwich for the latter
+  /// two) instead of falling back to the ZSXX Euler sequence.
+  bool supportsDirectRx = false;
+};
+
+/// Resolved menu: emitters to try for 1q synthesis and entangler bases for 2q.
+/// Built by `resolveNativeGatesSpec`. Single-qubit synthesis is deterministic:
+/// the first emitter is preferred and its Euler basis drives matrix synthesis.
+struct NativeProfileSpec {
+  bool allowRzz = false;
+  llvm::DenseSet<NativeGateKind> allowedGates;
+  llvm::SmallVector<SingleQubitEmitterSpec> singleQubitEmitters;
+  llvm::SmallVector<EntanglerBasis> entanglerBases;
+};
+
+} // namespace mlir::qco::native_synth
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/Utils.h b/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/Utils.h
new file mode 100644
index 0000000000..fbf70131b8
--- /dev/null
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/NativeSynthesis/Utils.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#pragma once
+
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/ADT/SmallVector.h>
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/PatternMatch.h>
+
+#include <optional>
+
+/// F64 helpers and block unitary extraction for native gate synthesis.
+
+namespace mlir::qco::native_synth {
+
+/// Create an ``arith.constant`` F64.
+Value createF64Const(IRRewriter& rewriter, Location loc, double value);
+
+/// If ``value`` is an F64 ``arith.constant``, return its value.
+std::optional<double> getConstantF64(Value value);
+
+/// Emit a `qco.gphase` if `phase` is non-negligible.
+void emitGPhaseIfNonTrivial(IRRewriter& rewriter, Location loc, double phase);
+
+/// 4x4 for a 2q block member (plain 2q, ``CtrlOp`` CX/CZ, or lifted 1q). Fails
+/// for barriers, ``gphase``, multi-control, or non-constant matrix parameters.
+bool getBlockTwoQubitMatrix(Operation* op, Matrix4x4& matrix);
+
+/// Pre-order walk: every op implementing `UnitaryOpInterface` under `root`,
+/// excluding bodies nested under `ctrl` / `inv`.
+void collectUnitaryOpsInPreOrder(Operation* root,
+                                 llvm::SmallVectorImpl<Operation*>& ops);
+
+} // namespace mlir::qco::native_synth
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/Passes.h b/mlir/include/mlir/Dialect/QCO/Transforms/Passes.h
index 7444438a88..39289a1062 100644
--- a/mlir/include/mlir/Dialect/QCO/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/Passes.h
@@ -14,6 +14,9 @@
 #include <mlir/Pass/Pass.h>
 #include <mlir/Pass/PassRegistry.h>
 
+#include <memory>
+#include <string>
+
 namespace mlir::qco {
 
 #define GEN_PASS_DECL
@@ -27,4 +30,15 @@ namespace mlir::qco {
 #define GEN_PASS_REGISTRATION
 #include "mlir/Dialect/QCO/Transforms/Passes.h.inc" // IWYU pragma: export
 
+/// Options for the native gate synthesis pass.
+struct NativeGateSynthesisOptions {
+  /// Comma-separated list of gate tokens (see `Passes.td` for recognised
+  /// tokens). An empty or whitespace-only menu makes the pass a no-op.
+  std::string nativeGates;
+};
+
+/// Creates the `native-gate-synthesis` pass configured by @p options.
+std::unique_ptr<Pass>
+createNativeGateSynthesisPass(const NativeGateSynthesisOptions& options);
+
 } // namespace mlir::qco
diff --git a/mlir/include/mlir/Dialect/QCO/Transforms/Passes.td b/mlir/include/mlir/Dialect/QCO/Transforms/Passes.td
index fafb906a77..0b66c68578 100644
--- a/mlir/include/mlir/Dialect/QCO/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/QCO/Transforms/Passes.td
@@ -66,6 +66,34 @@ def FuseSingleQubitUnitaryRuns
                         "Target Euler basis (zyz, zxz, xzx, xyx, u, zsxx).">];
 }
 
+def FuseTwoQubitUnitaryRuns
+    : Pass<"fuse-two-qubit-unitary-runs", "mlir::ModuleOp"> {
+  let dependentDialects = ["mlir::qco::QCODialect"];
+  let summary = "Fuse two-qubit unitary runs using Weyl/KAK resynthesis";
+  let description = [{
+    Scans the module for maximal two-qubit windows: contiguous sequences of
+    two-qubit unitaries on the same wire pair, with single-qubit gates on those
+    wires absorbed into the window's accumulated `4×4` unitary when they have
+    a single use. Each window with at least two ops is replaced when beneficial:
+    when the window contains any gate outside the `native-gates` menu, or when
+    deterministic Weyl/KAK resynthesis to that menu uses strictly fewer
+    entanglers than the window already contains.
+
+    The `native-gates` option uses the same comma-separated token list as
+    `native-gate-synthesis` (e.g. `u,cx`, `x,sx,rz,cx`). An empty or
+    whitespace-only menu is a no-op. An unrecognised token causes the pass to
+    fail.
+
+    Barriers, global phase, fan-out, and ops on more than two qubits close
+    open windows. Bodies nested under `qco.ctrl` or `qco.inv` are not tracked
+    independently.
+  }];
+  let options = [Option<
+      "nativeGates", "native-gates", "std::string", "\"\"",
+      "Comma-separated native gate menu. Empty or whitespace-only is "
+      "a no-op. Tokens: u, x, sx, rz (or p), rx, ry, r, cx, cz, rzz.">];
+}
+
 def QuantumLoopUnroll
     : InterfacePass<"quantum-loop-unroll", "FunctionOpInterface"> {
   let dependentDialects = ["mlir::qco::QCODialect", "mlir::scf::SCFDialect"];
@@ -143,6 +171,61 @@ def MappingPass : Pass<"place-and-route", "mlir::ModuleOp"> {
                               "The number of inserted SWAPs">];
 }
 
+//===----------------------------------------------------------------------===//
+// Native gate synthesis
+//===----------------------------------------------------------------------===//
+
+def NativeGateSynthesisPass : Pass<"native-gate-synthesis", "mlir::ModuleOp"> {
+  let dependentDialects = ["mlir::qco::QCODialect"];
+  let summary = "Lower QCO unitary gates to a user-specified native gate menu.";
+  let description = [{
+    This pass rewrites a module so that every remaining unitary operation is
+    allowed by the `native-gates` menu. `qco.barrier` and `qco.gphase` are
+    preserved; controlled gates (`qco.ctrl`) must have a single control and a
+    single target.
+
+    The menu is a comma-separated list of gate tokens (order not significant)
+    from which the pass builds a profile: a single-qubit synthesis mode
+    (generic `qco.u` when `u` is present; IBM-style surface gates when all of
+    `x`, `sx`, and `rz`/`p` are present; IQM-style `qco.r` when `r` is present;
+    or a supported rotation pair chosen from `rx`, `ry`, `rz`) plus optional
+    two-qubit entanglers `cx`, `cz`, and optional `rzz`.
+
+    Recognised tokens: `u`, `x`, `sx`, `rz` (or `p`), `rx`, `ry`, `r`, `cx`,
+    `cz`, `rzz`. An empty or whitespace-only menu is a no-op, which is the
+    intended pipeline default when synthesis is not needed. An unrecognised
+    token causes the pass to fail.
+
+    Example menus (each line is one illustrative menu; pick either `cx` or
+    `cz` as the entangler, or list both if both are native):
+    - IBM basic (no fractional two-qubit): `x,sx,rz,cx` or `x,sx,rz,cz`
+    - IBM fractional: `x,sx,rz,rx,rzz,cx` or `x,sx,rz,rx,rzz,cz`
+    - Generic single-qubit U: `u,cx` or `u,cz`
+    - IQM default: `r,cz` (or `r,cx` if CX is the native entangler)
+    - Rotation pair + entangler: `rx,rz,cx`, `rx,ry,cz`, `ry,rz,cx`, etc.
+      Supported pairs are exactly `rx`+`rz`, `rx`+`ry`, and `ry`+`rz`.
+
+    Execution order (mirrors the implementation): fuse consecutive single-qubit
+    runs; fuse two-qubit windows (including absorbed single-qubit padding) via
+    `fuse-two-qubit-unitary-runs`; run up to four synthesis sweeps over
+    remaining non-native unitaries until every single-qubit op matches the menu
+    (two-qubit lowering may temporarily emit off-menu 1q ops that later sweeps
+    absorb—if any remain after that cap, the pass fails); fuse 1q seams between
+    two-qubit blocks; then up to four further synthesis + fusion rounds until
+    the full menu holds (including native `qco.ctrl` shells and bare `rzz` when
+    allowed). If anything is still off-menu, the pass fails.
+
+    Lowering is deterministic: the entangler is chosen as `cx` before `cz`, the
+    single-qubit factors use the first emitter's Euler basis, and the minimal
+    KAK entangler count drives two-qubit window replacement.
+  }];
+  let options = [Option<
+      "nativeGates", "native-gates", "std::string", "\"\"",
+      "Comma-separated native gate menu. Empty or whitespace-only is "
+      "a no-op. Tokens: u, x, sx, rz (or p), rx, ry, r, cx, cz, rzz. "
+      "Examples: x,sx,rz,cx; x,sx,rz,rx,rzz,cz; u,cx; r,cz; rx,rz,cx.">];
+}
+
 //===----------------------------------------------------------------------===//
 // Optimization Passes
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/QCO/Utils/Matrix.h b/mlir/include/mlir/Dialect/QCO/Utils/Matrix.h
index 8f49d372d7..1ac9b8f8c3 100644
--- a/mlir/include/mlir/Dialect/QCO/Utils/Matrix.h
+++ b/mlir/include/mlir/Dialect/QCO/Utils/Matrix.h
@@ -183,6 +183,12 @@ struct Matrix2x2 {
    */
   [[nodiscard]] Matrix2x2 adjoint() const;
 
+  /**
+   * @brief Returns the (non-conjugate) transpose of this matrix.
+   * @return Transposed matrix `A^T`.
+   */
+  [[nodiscard]] Matrix2x2 transpose() const;
+
   /**
    * @brief Returns the trace of this matrix.
    * @return Sum of diagonal entries.
@@ -195,6 +201,13 @@ struct Matrix2x2 {
    */
   [[nodiscard]] Complex determinant() const;
 
+  /**
+   * @brief Checks whether this matrix is approximately the identity.
+   * @param tol Maximum allowed complex modulus of each entry difference.
+   * @return True if every entry is within @p tol of the identity.
+   */
+  [[nodiscard]] bool isIdentity(double tol = MATRIX_TOLERANCE) const;
+
   /**
    * @brief Checks approximate equality using an absolute entry-wise tolerance.
    *
@@ -322,6 +335,12 @@ struct Matrix4x4 {
    */
   [[nodiscard]] Matrix4x4 adjoint() const;
 
+  /**
+   * @brief Returns the (non-conjugate) transpose of this matrix.
+   * @return Transposed matrix `A^T`.
+   */
+  [[nodiscard]] Matrix4x4 transpose() const;
+
   /**
    * @brief Returns the trace of this matrix.
    * @return Sum of diagonal entries.
@@ -334,6 +353,53 @@ struct Matrix4x4 {
    */
   [[nodiscard]] Complex determinant() const;
 
+  /**
+   * @brief Checks whether this matrix is approximately the identity.
+   * @param tol Maximum allowed complex modulus of each entry difference.
+   * @return True if every entry is within @p tol of the identity.
+   */
+  [[nodiscard]] bool isIdentity(double tol = MATRIX_TOLERANCE) const;
+
+  /**
+   * @brief Returns the four diagonal entries `(m00, m11, m22, m33)`.
+   * @return Array of diagonal entries.
+   */
+  [[nodiscard]] std::array<Complex, K_ROWS> diagonal() const;
+
+  /**
+   * @brief Builds a diagonal matrix from four diagonal entries.
+   * @param diagonalEntries Diagonal entries `(m00, m11, m22, m33)`.
+   * @return Diagonal matrix with the given entries.
+   */
+  [[nodiscard]] static Matrix4x4
+  fromDiagonal(const std::array<Complex, K_ROWS>& diagonalEntries);
+
+  /**
+   * @brief Returns the entries of column @p col, top to bottom.
+   * @param col Column index in `[0, K_COLS)`.
+   * @return Array of the four column entries.
+   */
+  [[nodiscard]] std::array<Complex, K_ROWS> column(std::size_t col) const;
+
+  /**
+   * @brief Overwrites column @p col with @p values.
+   * @param col Column index in `[0, K_COLS)`.
+   * @param values New column entries, top to bottom.
+   */
+  void setColumn(std::size_t col, const std::array<Complex, K_ROWS>& values);
+
+  /**
+   * @brief Returns the element-wise real parts in row-major order.
+   * @return Real parts of all entries.
+   */
+  [[nodiscard]] std::array<double, K_SIZE_AT_COMPILE_TIME> realPart() const;
+
+  /**
+   * @brief Returns the element-wise imaginary parts in row-major order.
+   * @return Imaginary parts of all entries.
+   */
+  [[nodiscard]] std::array<double, K_SIZE_AT_COMPILE_TIME> imagPart() const;
+
   /**
    * @brief Checks approximate equality using an absolute entry-wise tolerance.
    *
@@ -558,4 +624,54 @@ inline constexpr bool
     std::disjunction_v<std::is_same<T, Matrix1x1>, std::is_same<T, Matrix2x2>,
                        std::is_same<T, Matrix4x4>,
                        std::is_same<T, DynamicMatrix>>;
+
+/**
+ * @brief Kronecker product `lhs (x) rhs` of two single-qubit matrices.
+ *
+ * Uses the computational-basis bit order where the first operand labels the
+ * high bit, matching `UnitaryOpInterface::getUnitaryMatrix4x4`.
+ *
+ * @param lhs Left factor (acts on the high bit / qubit 0).
+ * @param rhs Right factor (acts on the low bit / qubit 1).
+ * @return The `4x4` Kronecker product.
+ */
+[[nodiscard]] Matrix4x4 kron(const Matrix2x2& lhs, const Matrix2x2& rhs);
+
+/// Scalar-on-the-left multiply `scalar * matrix` (commutes with the member
+/// `matrix * scalar`). Provided so generic code can scale a matrix from
+/// either side.
+[[nodiscard]] Matrix2x2 operator*(const Complex& scalar,
+                                  const Matrix2x2& matrix);
+/// @copydoc operator*(const Complex&, const Matrix2x2&)
+[[nodiscard]] Matrix4x4 operator*(const Complex& scalar,
+                                  const Matrix4x4& matrix);
+
+/**
+ * @brief Eigenvalues and eigenvectors of a real symmetric `4x4` matrix.
+ *
+ * `eigenvalues` are sorted ascending and `eigenvectors` holds the
+ * corresponding orthonormal eigenvectors as columns (column `j` is the
+ * eigenvector for `eigenvalues[j]`), matching the convention of
+ * `Eigen::SelfAdjointEigenSolver`.
+ */
+struct SymmetricEigen4 {
+  /// Eigenvalues sorted in ascending order.
+  std::array<double, 4> eigenvalues{};
+  /// Orthonormal eigenvectors as columns (real-valued, zero imaginary part).
+  Matrix4x4 eigenvectors{};
+};
+
+/**
+ * @brief Computes the eigendecomposition of a real symmetric `4x4` matrix.
+ *
+ * Implements the cyclic Jacobi eigenvalue algorithm, which is numerically
+ * robust for small symmetric matrices and yields orthonormal eigenvectors
+ * even for degenerate spectra.
+ *
+ * @param symmetric Row-major real symmetric `4x4` matrix.
+ * @return Ascending eigenvalues and matching eigenvectors (as columns).
+ */
+[[nodiscard]] SymmetricEigen4
+jacobiSymmetricEigen(const std::array<double, 16>& symmetric);
+
 } // namespace mlir::qco
diff --git a/mlir/lib/Compiler/CompilerPipeline.cpp b/mlir/lib/Compiler/CompilerPipeline.cpp
index 821ccda2ee..40846f2ef9 100644
--- a/mlir/lib/Compiler/CompilerPipeline.cpp
+++ b/mlir/lib/Compiler/CompilerPipeline.cpp
@@ -90,7 +90,7 @@ QuantumCompilerPipeline::runPipeline(ModuleOp module,
   // 2. QC cleanup
   // 3. QC-to-QCO conversion
   // 4. QCO cleanup
-  // 5. Optimization passes
+  // 5. Optimization and Native Gate Synthesis
   // 6. QCO cleanup
   // 7. QCO-to-QC conversion
   // 8. QC cleanup
@@ -145,7 +145,7 @@ QuantumCompilerPipeline::runPipeline(ModuleOp module,
                        totalStages);
     }
   }
-  // Stage 5: Optimization passes
+  // Stage 5: Optimization and native gate synthesis
   if (failed(runStage([&](PassManager& pm) {
         if (!config_.disableMergeSingleQubitRotationGates) {
           pm.addPass(qco::createMergeSingleQubitRotationGates());
@@ -153,14 +153,18 @@ QuantumCompilerPipeline::runPipeline(ModuleOp module,
         if (config_.enableHadamardLifting) {
           pm.addPass(qco::createHadamardLifting());
         }
+        pm.addPass(
+            qco::createNativeGateSynthesisPass(qco::NativeGateSynthesisOptions{
+                .nativeGates = config_.nativeGates,
+            }));
       }))) {
     return failure();
   }
   if (record != nullptr && config_.recordIntermediates) {
     record->afterOptimization = captureIR(module);
     if (config_.printIRAfterAllStages) {
-      prettyPrintStage(module, "Optimization Passes", ++currentStage,
-                       totalStages);
+      prettyPrintStage(module, "Optimization and Native Gate Synthesis",
+                       ++currentStage, totalStages);
     }
   }
   // Stage 6: QCO cleanup
diff --git a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/ROp.cpp b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/ROp.cpp
index 1b742170fa..1baee612d9 100644
--- a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/ROp.cpp
+++ b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/ROp.cpp
@@ -115,9 +115,14 @@ std::optional<Matrix2x2> ROp::getUnitaryMatrix() {
     return std::nullopt;
   }
 
+  using namespace std::complex_literals;
   const auto thetaSin = std::sin(*theta / 2);
-  const auto m01 = std::polar(thetaSin, -*phi - (std::numbers::pi / 2));
-  const auto m10 = std::polar(thetaSin, *phi - (std::numbers::pi / 2));
+  // `std::polar` has undefined behavior for negative magnitudes (libc++ returns
+  // NaN), and `sin(theta / 2)` is negative for negative `theta`. Build the
+  // phased entries via `sin * e^{i*phi}` instead, which is well-defined for any
+  // sign of the magnitude.
+  const auto m01 = thetaSin * std::exp(1i * (-*phi - (std::numbers::pi / 2)));
+  const auto m10 = thetaSin * std::exp(1i * (*phi - (std::numbers::pi / 2)));
   const auto thetaCos = std::cos(*theta / 2);
   return Matrix2x2::fromElements(thetaCos, m01,  // row 0
                                  m10, thetaCos); // row 1
diff --git a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/UOp.cpp b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/UOp.cpp
index f504ba3a49..4cbe633883 100644
--- a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/UOp.cpp
+++ b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/UOp.cpp
@@ -133,12 +133,16 @@ std::optional<Matrix2x2> UOp::getUnitaryMatrix() {
     return std::nullopt;
   }
 
+  using namespace std::complex_literals;
   const auto c = std::cos(*theta / 2);
   const auto s = std::sin(*theta / 2);
 
-  const auto m01 = std::polar(s, *lambda + std::numbers::pi);
-  const auto m10 = std::polar(s, *phi);
-  const auto m11 = std::polar(c, *phi + *lambda);
+  // `std::polar` has undefined behavior for negative magnitudes (libc++ returns
+  // NaN), and `sin`/`cos` of `theta / 2` can be negative. Build the phased
+  // entries via `mag * e^{i*phi}` instead, which is well-defined for any sign.
+  const auto m01 = s * std::exp(1i * (*lambda + std::numbers::pi));
+  const auto m10 = s * std::exp(1i * (*phi));
+  const auto m11 = c * std::exp(1i * (*phi + *lambda));
   return Matrix2x2::fromElements(c, m01,    // row 0
                                  m10, m11); // row 1
 }
diff --git a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXMinusYYOp.cpp b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXMinusYYOp.cpp
index c75c45f26e..24e5fbf8c5 100644
--- a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXMinusYYOp.cpp
+++ b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXMinusYYOp.cpp
@@ -79,10 +79,15 @@ std::optional<Matrix4x4> XXMinusYYOp::getUnitaryMatrix() {
     return std::nullopt;
   }
 
+  using namespace std::complex_literals;
   const auto mc = std::cos(*theta / 2);
   const auto s = std::sin(*theta / 2);
-  const auto msp = std::polar(s, *beta - (std::numbers::pi / 2));
-  const auto msm = std::polar(s, -*beta - (std::numbers::pi / 2));
+  // `std::polar` has undefined behavior for negative magnitudes (libc++ returns
+  // NaN), and `s = sin(theta / 2)` is negative for negative `theta`. Build the
+  // phased entries via `s * e^{i*phi}` instead, which is well-defined for any
+  // sign of `s`.
+  const auto msp = s * std::exp(1i * (*beta - (std::numbers::pi / 2)));
+  const auto msm = s * std::exp(1i * (-*beta - (std::numbers::pi / 2)));
   return Matrix4x4::fromElements(mc, 0, 0, msm,  // row 0
                                  0, 1, 0, 0,     // row 1
                                  0, 0, 1, 0,     // row 2
diff --git a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXPlusYYOp.cpp b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXPlusYYOp.cpp
index 6511b2344b..4cfb663658 100644
--- a/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXPlusYYOp.cpp
+++ b/mlir/lib/Dialect/QCO/IR/Operations/StandardGates/XXPlusYYOp.cpp
@@ -79,10 +79,15 @@ std::optional<Matrix4x4> XXPlusYYOp::getUnitaryMatrix() {
     return std::nullopt;
   }
 
+  using namespace std::complex_literals;
   const auto mc = std::cos(*theta / 2);
   const auto s = std::sin(*theta / 2);
-  const auto msp = std::polar(s, *beta - (std::numbers::pi / 2));
-  const auto msm = std::polar(s, -*beta - (std::numbers::pi / 2));
+  // `std::polar` has undefined behavior for negative magnitudes (libc++ returns
+  // NaN), and `s = sin(theta / 2)` is negative for negative `theta`. Build the
+  // phased entries via `s * e^{i*phi}` instead, which is well-defined for any
+  // sign of `s`.
+  const auto msp = s * std::exp(1i * (*beta - (std::numbers::pi / 2)));
+  const auto msm = s * std::exp(1i * (-*beta - (std::numbers::pi / 2)));
   return Matrix4x4::fromElements(1, 0, 0, 0,    // row 0
                                  0, mc, msp, 0, // row 1
                                  0, msm, mc, 0, // row 2
diff --git a/mlir/lib/Dialect/QCO/Transforms/CMakeLists.txt b/mlir/lib/Dialect/QCO/Transforms/CMakeLists.txt
index 3564b55dc5..f02cdaf23b 100644
--- a/mlir/lib/Dialect/QCO/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/QCO/Transforms/CMakeLists.txt
@@ -22,11 +22,11 @@ add_mlir_library(
   DEPENDS
   MLIRQCOTransformsIncGen)
 
-# collect header files
-file(GLOB_RECURSE PASSES_HEADERS_SOURCE
-     ${MQT_MLIR_SOURCE_INCLUDE_DIR}/mlir/Dialect/QCO/Transforms/*.h)
-file(GLOB_RECURSE PASSES_HEADERS_BUILD
-     ${MQT_MLIR_BUILD_INCLUDE_DIR}/mlir/Dialect/QCO/Transforms/*.inc)
+# collect header files (subdirs: NativeSynthesis/, Decomposition/, …)
+file(GLOB_RECURSE PASSES_HEADERS_SOURCE CONFIGURE_DEPENDS
+     "${MQT_MLIR_SOURCE_INCLUDE_DIR}/mlir/Dialect/QCO/Transforms/*.h")
+file(GLOB_RECURSE PASSES_HEADERS_BUILD CONFIGURE_DEPENDS
+     "${MQT_MLIR_BUILD_INCLUDE_DIR}/mlir/Dialect/QCO/Transforms/*.inc")
 
 # add public headers using file sets
 target_sources(
diff --git a/mlir/lib/Dialect/QCO/Transforms/Decomposition/BasisDecomposer.cpp b/mlir/lib/Dialect/QCO/Transforms/Decomposition/BasisDecomposer.cpp
new file mode 100644
index 0000000000..d8e837c7ba
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Transforms/Decomposition/BasisDecomposer.cpp
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/Transforms/Decomposition/BasisDecomposer.h"
+
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Helpers.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/UnitaryMatrices.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/WeylDecomposition.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/ADT/Twine.h>
+#include <llvm/Support/ErrorHandling.h>
+#include <mlir/Support/LLVM.h>
+
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <numbers>
+#include <optional>
+#include <utility>
+
+namespace mlir::qco::decomposition {
+
+using namespace std::complex_literals;
+
+TwoQubitBasisDecomposer
+TwoQubitBasisDecomposer::create(const Matrix4x4& basisMatrix,
+                                double basisFidelity) {
+  const Matrix2x2 k12RArr = Matrix2x2::fromElements(
+      1i * FRAC1_SQRT2, FRAC1_SQRT2, -FRAC1_SQRT2, -1i * FRAC1_SQRT2);
+  const Matrix2x2 k12LArr =
+      Matrix2x2::fromElements(Complex{0.5, 0.5}, Complex{0.5, 0.5},
+                              Complex{-0.5, 0.5}, Complex{0.5, -0.5});
+
+  // The Shende-Markov-Bullock 3-CX sandwich (and its 1/2-CX reductions) used
+  // below is derived for a basis CX whose 4x4 matrix is the Qiskit/LSB form
+  // `[[1,0,0,0],[0,0,0,1],[0,0,1,0],[0,1,0,0]]`, i.e. "control on the LSB
+  // factor, target on the MSB factor" of the tensor product. MQT's wider
+  // convention places operand 0 on the MSB factor, so the CX/CZ matrix for
+  // control-on-wire-0 gives the SWAP-conjugate
+  // `[[1,0,0,0],[0,1,0,0],[0,0,0,1],[0,0,1,0]]`.
+  //
+  // Because `SWAP * C(a,b,c) * SWAP = C(a,b,c)` but
+  // `SWAP * (K1l ⊗ K1r) * SWAP = (K1r ⊗ K1l)`, feeding the MSB matrix directly
+  // into the Weyl decomposer would swap the roles of `k1l`/`k1r` (and `k2l`/
+  // `k2r`) relative to the hard-coded constants above. To keep the SMB algebra
+  // self-consistent we SWAP-conjugate the basis matrix here (restoring the
+  // Qiskit/LSB 4x4) and then absorb the resulting "left/right" relabeling at
+  // the emission boundary in `decomp{0,1,2,3}` below. This reproduces the
+  // pre-flip gate counts without having to re-derive every SMB constant for
+  // the MSB basis -- the two routes are algebraically equivalent.
+  const Matrix4x4 basisMatrixLsb = swapGate() * basisMatrix * swapGate();
+  const auto basisDecomposer = decomposition::TwoQubitWeylDecomposition::create(
+      basisMatrixLsb, basisFidelity);
+  const auto isSuperControlled =
+      relativeEq(basisDecomposer.a(), std::numbers::pi / 4.0, 1e-13, 1e-09) &&
+      relativeEq(basisDecomposer.c(), 0.0, 1e-13, 1e-09);
+
+  // Create some useful matrices: U1, U2, U3 are each equivalent to the basis
+  // gate and expand as Ui = Ki1.Ubasis.Ki2
+  auto b = basisDecomposer.b();
+  Complex temp{0.5, -0.5};
+  const Matrix2x2 k11l = Matrix2x2::fromElements(
+      temp * (-1i * std::exp(-1i * b)), temp * std::exp(-1i * b),
+      temp * (-1i * std::exp(1i * b)), temp * -std::exp(1i * b));
+  const Matrix2x2 k11r = Matrix2x2::fromElements(
+      FRAC1_SQRT2 * (1i * std::exp(-1i * b)), FRAC1_SQRT2 * -std::exp(-1i * b),
+      FRAC1_SQRT2 * std::exp(1i * b), FRAC1_SQRT2 * (-1i * std::exp(1i * b)));
+  const Matrix2x2 k32lK21l =
+      Matrix2x2::fromElements(FRAC1_SQRT2 * Complex{1., std::cos(2. * b)},
+                              FRAC1_SQRT2 * (1i * std::sin(2. * b)),
+                              FRAC1_SQRT2 * (1i * std::sin(2. * b)),
+                              FRAC1_SQRT2 * Complex{1., -std::cos(2. * b)});
+  temp = Complex{0.5, 0.5};
+  const Matrix2x2 k21r = Matrix2x2::fromElements(
+      temp * (-1i * std::exp(-2i * b)), temp * std::exp(-2i * b),
+      temp * (1i * std::exp(2i * b)), temp * std::exp(2i * b));
+  const Matrix2x2 k22l = Matrix2x2::fromElements(FRAC1_SQRT2, -FRAC1_SQRT2,
+                                                 FRAC1_SQRT2, FRAC1_SQRT2);
+  const Matrix2x2 k22r = Matrix2x2::fromElements(0, 1, -1, 0);
+  const Matrix2x2 k31l = Matrix2x2::fromElements(
+      FRAC1_SQRT2 * std::exp(-1i * b), FRAC1_SQRT2 * std::exp(-1i * b),
+      FRAC1_SQRT2 * -std::exp(1i * b), FRAC1_SQRT2 * std::exp(1i * b));
+  const Matrix2x2 k31r = Matrix2x2::fromElements(1i * std::exp(1i * b), 0, 0,
+                                                 -1i * std::exp(-1i * b));
+  const Matrix2x2 k32r = Matrix2x2::fromElements(
+      temp * std::exp(1i * b), temp * -std::exp(-1i * b),
+      temp * (-1i * std::exp(1i * b)), temp * (-1i * std::exp(-1i * b)));
+  auto k1lDagger = basisDecomposer.k1l().adjoint();
+  auto k1rDagger = basisDecomposer.k1r().adjoint();
+  auto k2lDagger = basisDecomposer.k2l().adjoint();
+  auto k2rDagger = basisDecomposer.k2r().adjoint();
+  // Pre-build the fixed parts of the matrices used in the 3-part decomposition
+  auto u0l = k31l * k1lDagger;
+  auto u0r = k31r * k1rDagger;
+  auto u1l = k2lDagger * k32lK21l * k1lDagger;
+  auto u1ra = k2rDagger * k32r;
+  auto u1rb = k21r * k1rDagger;
+  auto u2la = k2lDagger * k22l;
+  auto u2lb = k11l * k1lDagger;
+  auto u2ra = k2rDagger * k22r;
+  auto u2rb = k11r * k1rDagger;
+  auto u3l = k2lDagger * k12LArr;
+  auto u3r = k2rDagger * k12RArr;
+  // Pre-build the fixed parts of the matrices used in the 2-part decomposition
+  auto q0l = k12LArr.adjoint() * k1lDagger;
+  auto q0r = k12RArr.adjoint() * ipz() * k1rDagger;
+  auto q1la = k2lDagger * k11l.adjoint();
+  auto q1lb = k11l * k1lDagger;
+  auto q1ra = k2rDagger * ipz() * k11r.adjoint();
+  auto q1rb = k11r * k1rDagger;
+  auto q2l = k2lDagger * k12LArr;
+  auto q2r = k2rDagger * k12RArr;
+
+  return TwoQubitBasisDecomposer{
+      basisFidelity,
+      basisDecomposer,
+      isSuperControlled,
+      u0l,
+      u0r,
+      u1l,
+      u1ra,
+      u1rb,
+      u2la,
+      u2lb,
+      u2ra,
+      u2rb,
+      u3l,
+      u3r,
+      q0l,
+      q0r,
+      q1la,
+      q1lb,
+      q1ra,
+      q1rb,
+      q2l,
+      q2r,
+  };
+}
+
+std::optional<TwoQubitNativeDecomposition>
+TwoQubitBasisDecomposer::twoQubitDecompose(
+    const decomposition::TwoQubitWeylDecomposition& targetDecomposition,
+    std::optional<std::uint8_t> numBasisGateUses) const {
+  // Default basis-gate count: the `i ∈ {0, 1, 2, 3}` that maximizes
+  //   expected_fidelity(i) = traceToFidelity(traces[i]) * basisFidelity^i.
+  // Only run when no explicit count was requested (see `bestNbasis` below).
+  auto getDefaultNbasis = [&]() -> std::uint8_t {
+    const auto candidateTraces = traces(targetDecomposition);
+    auto bestValue = std::numeric_limits<double>::lowest();
+    auto bestIndex = -1;
+    for (int i = 0; std::cmp_less(i, candidateTraces.size()); ++i) {
+      auto value = helpers::traceToFidelity(candidateTraces[i]) *
+                   std::pow(basisFidelity, i);
+      if (std::isnan(value)) {
+        continue;
+      }
+      if (value > bestValue) {
+        bestIndex = i;
+        bestValue = value;
+      }
+    }
+    if (bestIndex < 0) {
+      llvm::reportFatalInternalError("Unable to select basis-gate count: all "
+                                     "candidate fidelities are NaN");
+    }
+    return static_cast<std::uint8_t>(bestIndex);
+  };
+  // Number of basis gates to use. Evaluated lazily: `value_or(f())` would run
+  // the selection (and its all-NaN fatal error) even for an explicit count.
+  const std::uint8_t bestNbasis =
+      numBasisGateUses.has_value() ? *numBasisGateUses : getDefaultNbasis();
+  if (bestNbasis > 1 && !isSuperControlled) {
+    // multiple basis uses are only reliable for super-controlled bases
+    return std::nullopt;
+  }
+  auto chooseDecomposition = [&]() {
+    if (bestNbasis == 0) {
+      return decomp0(targetDecomposition);
+    }
+    if (bestNbasis == 1) {
+      return decomp1(targetDecomposition);
+    }
+    if (bestNbasis == 2) {
+      return decomp2Supercontrolled(targetDecomposition);
+    }
+    if (bestNbasis == 3) {
+      return decomp3Supercontrolled(targetDecomposition);
+    }
+    // Widen first: `llvm::Twine(std::uint8_t)` would print a character.
+    llvm::reportFatalInternalError(
+        "Invalid number of basis gates to use in basis decomposition (" +
+        llvm::Twine(static_cast<unsigned>(bestNbasis)) + ")!");
+  };
+  TwoQubitLocalUnitaryList factors = chooseDecomposition();
+#ifndef NDEBUG
+  for (const auto& factor : factors) {
+    assert(helpers::isUnitaryMatrix(factor));
+  }
+#endif
+
+  double globalPhase = targetDecomposition.globalPhase();
+  globalPhase -= bestNbasis * basisDecomposer.globalPhase();
+  if (bestNbasis == 2) {
+    // The two-basis template in `decomp2Supercontrolled` yields a sequence
+    // whose global phase is off by `pi` relative to the target; compensate
+    // so the emitted sequence matches the target exactly, not just up to sign.
+    globalPhase += std::numbers::pi;
+  }
+  // the decomposition can produce large global phases; wrap into [0, 2*pi)
+  globalPhase = helpers::remEuclid(globalPhase, 2.0 * std::numbers::pi);
+
+  return TwoQubitNativeDecomposition{
+      .numBasisUses = bestNbasis,
+      .singleQubitFactors = std::move(factors),
+      .globalPhase = globalPhase,
+  };
+}
+
+// Ported SMB helpers assume Qiskit Weyl k-factor layout; QCO 4x4 input order
+// swaps l/r vs that port. Swap k1l<->k1r and k2l<->k2r when reading ``target``,
+// and swap adjacent pairs in each return vector so the emission boundary maps
+// matrices to the same wires as the upstream decomposer. ``decomp0`` cancels to
+// the unswapped formula.
+TwoQubitLocalUnitaryList
+TwoQubitBasisDecomposer::decomp0(const TwoQubitWeylDecomposition& target) {
+  // Zero basis gates: each entry is the product of one k1 and one k2 factor.
+  const Matrix2x2 first = target.k1r() * target.k2r();
+  const Matrix2x2 second = target.k1l() * target.k2l();
+  return TwoQubitLocalUnitaryList{first, second};
+}
+
+TwoQubitLocalUnitaryList TwoQubitBasisDecomposer::decomp1(
+    const TwoQubitWeylDecomposition& target) const {
+  // may not work for z != 0 and c != 0 (not always in Weyl chamber)
+  const auto& basis = basisDecomposer;
+  const Matrix2x2 first = basis.k2l().adjoint() * target.k2r();
+  const Matrix2x2 second = basis.k2r().adjoint() * target.k2l();
+  const Matrix2x2 third = target.k1r() * basis.k1l().adjoint();
+  const Matrix2x2 fourth = target.k1l() * basis.k1r().adjoint();
+  return TwoQubitLocalUnitaryList{first, second, third, fourth};
+}
+
+TwoQubitLocalUnitaryList TwoQubitBasisDecomposer::decomp2Supercontrolled(
+    const TwoQubitWeylDecomposition& target) const {
+  if (!isSuperControlled) {
+    llvm::reportFatalInternalError(
+        "Basis gate of TwoQubitBasisDecomposer is not super-controlled "
+        "- no guarantee for exact decomposition with two basis gates");
+  }
+  // Name the two Z rotations built from the target's Weyl parameters.
+  const Matrix2x2 rotA = rzMatrix(-2. * target.a());
+  const Matrix2x2 rotB = rzMatrix(2. * target.b());
+  return TwoQubitLocalUnitaryList{
+      q2l * target.k2r(), q2r * target.k2l(), //
+      q1la * rotA * q1lb, q1ra * rotB * q1rb, //
+      target.k1r() * q0l, target.k1l() * q0r,
+  };
+}
+
+TwoQubitLocalUnitaryList TwoQubitBasisDecomposer::decomp3Supercontrolled(
+    const TwoQubitWeylDecomposition& target) const {
+  if (!isSuperControlled) {
+    llvm::reportFatalInternalError(
+        "Basis gate of TwoQubitBasisDecomposer is not super-controlled "
+        "- no guarantee for exact decomposition with three basis gates");
+  }
+  // Name the three Z rotations built from the target's Weyl parameters.
+  const Matrix2x2 rotA = rzMatrix(-2. * target.a());
+  const Matrix2x2 rotB = rzMatrix(2. * target.b());
+  const Matrix2x2 rotC = rzMatrix(-2. * target.c());
+  return TwoQubitLocalUnitaryList{
+      u3l * target.k2r(), u3r * target.k2l(), //
+      u2la * rotA * u2lb, u2ra * rotB * u2rb, //
+      u1l, u1ra * rotC * u1rb,                //
+      target.k1r() * u0l, target.k1l() * u0r,
+  };
+}
+
+std::array<std::complex<double>, 4>
+TwoQubitBasisDecomposer::traces(const TwoQubitWeylDecomposition& target) const {
+  // Entry `i` holds the Hilbert-Schmidt trace between the target canonical
+  // gate and the best candidate reachable with `i` uses of the basis gate
+  // (`i` = 0, 1, 2, 3). `getDefaultNbasis` feeds these into `traceToFidelity`
+  // to pick the basis-gate count. Each entry is a closed-form case of
+  // `TwoQubitWeylDecomposition::getTrace(a, b, c, ap, bp, cp)`:
+  //   i == 0: no basis gate       (ap == bp == cp == 0)
+  //   i == 1: one basis use       (ap == pi/4, bp == basis.b, cp == 0)
+  //   i == 2: two basis uses      (ap == 0, bp == 0, cp == -target.c)
+  //   i == 3: three basis uses    (target reachable exactly -> trace == 4)
+  using Trace = std::complex<double>;
+
+  const auto a = target.a();
+  const auto b = target.b();
+  const auto c = target.c();
+  const auto deltaA = (std::numbers::pi / 4.0) - a;
+  const auto deltaB = basisDecomposer.b() - b;
+
+  return {
+      4. * Trace{std::cos(a) * std::cos(b) * std::cos(c),
+                 std::sin(a) * std::sin(b) * std::sin(c)},
+      4. * Trace{std::cos(deltaA) * std::cos(deltaB) * std::cos(c),
+                 std::sin(deltaA) * std::sin(deltaB) * std::sin(c)},
+      Trace{4. * std::cos(c), 0.},
+      Trace{4., 0.},
+  };
+}
+
+// Approximate equality: true when `lhs` and `rhs` differ by at most `epsilon`
+// or by at most `maxRelative` times the larger of their magnitudes.
+bool TwoQubitBasisDecomposer::relativeEq(double lhs, double rhs, double epsilon,
+                                         double maxRelative) {
+  // Exact match; this is also how two equal infinities compare equal.
+  if (lhs == rhs) {
+    return true;
+  }
+
+  // Any infinity that is left cannot be close to the other operand.
+  if (std::isinf(lhs) || std::isinf(rhs)) {
+    return false;
+  }
+
+  // Absolute tolerance, for when the numbers are really close together.
+  const auto distance = std::abs(lhs - rhs);
+  if (distance <= epsilon) {
+    return true;
+  }
+
+  // Relative tolerance, scaled by the larger of the two magnitudes.
+  const auto magLhs = std::abs(lhs);
+  const auto magRhs = std::abs(rhs);
+  const auto scale = (magRhs > magLhs) ? magRhs : magLhs;
+  return distance <= scale * maxRelative;
+}
+
+} // namespace mlir::qco::decomposition
diff --git a/mlir/lib/Dialect/QCO/Transforms/Decomposition/Euler.cpp b/mlir/lib/Dialect/QCO/Transforms/Decomposition/Euler.cpp
index 085d493abf..5f017109c5 100644
--- a/mlir/lib/Dialect/QCO/Transforms/Decomposition/Euler.cpp
+++ b/mlir/lib/Dialect/QCO/Transforms/Decomposition/Euler.cpp
@@ -102,21 +102,6 @@ static void emitGPhaseIfNeeded(OpBuilder& builder, Location loc, double phase) {
 // Euler decomposition (angles)
 //===----------------------------------------------------------------------===//
 
-/**
- * @brief Euler angles `(theta, phi, lambda)` and global phase for a 2x2
- * unitary.
- */
-namespace {
-
-struct EulerAngles {
-  double theta = 0.0;  ///< Middle rotation angle.
-  double phi = 0.0;    ///< First outer rotation angle.
-  double lambda = 0.0; ///< Second outer rotation angle.
-  double phase = 0.0;  ///< Global phase in radians.
-};
-
-} // namespace
-
 /**
  * @brief Z-Y-Z Euler angles and global phase for a 2x2 unitary.
  *
@@ -197,15 +182,7 @@ struct EulerAngles {
           .phase = phase - (0.5 * (phi + lambda))};
 }
 
-/**
- * @brief Extracts `(theta, phi, lambda, phase)` for all Euler bases.
- *
- * @param matrix The single-qubit unitary to decompose.
- * @param basis The target Euler basis.
- * @return The extracted Euler angles and global phase.
- */
-[[nodiscard]] static EulerAngles anglesFromUnitary(const Matrix2x2& matrix,
-                                                   const EulerBasis basis) {
+EulerAngles anglesFromUnitary(const Matrix2x2& matrix, const EulerBasis basis) {
   switch (basis) {
   case EulerBasis::ZYZ:
   case EulerBasis::ZSXX:
@@ -215,6 +192,9 @@ struct EulerAngles {
   case EulerBasis::XZX:
     return paramsXZX(matrix);
   case EulerBasis::XYX:
+  case EulerBasis::R:
+    // The `R` basis reuses the X-Y-X angles and lowers `Rx`/`Ry` to the native
+    // `R(theta, phi)` gate (`Rx(a) == R(a, 0)`, `Ry(a) == R(a, pi/2)`).
     return paramsXYX(matrix);
   case EulerBasis::U:
     return paramsU(matrix);
@@ -235,7 +215,7 @@ namespace {
  * `RZ`/`RY`/`RX` use @p theta as the rotation angle; `U` uses all three angles.
  */
 struct SynthesisStep {
-  enum class Kind : std::uint8_t { RZ, RY, RX, SX, X, U };
+  enum class Kind : std::uint8_t { RZ, RY, RX, SX, X, U, R };
 
   Kind kind = Kind::RZ;
   double theta = 0.0;
@@ -264,6 +244,19 @@ struct Unitary1QEulerPlan {
     }
   }
 
+  /**
+   * @brief Appends a native `R(angle, axis)` step for non-negligible angles.
+   *
+   * @param angle The rotation angle in radians.
+   * @param axis The rotation axis in the XY-plane (`0` for `Rx`, `pi/2` for
+   *             `Ry`).
+   */
+  void appendRStep(const double angle, const double axis) {
+    if (!isNearZeroRotationAngle(angle)) {
+      steps.emplace_back(SynthesisStep::Kind::R, angle, axis);
+    }
+  }
+
   /**
    * @brief Appends the decomposition for @p basis based on @p angles.
    *
@@ -290,6 +283,9 @@ struct Unitary1QEulerPlan {
       case EulerBasis::XYX:
         appendRotation(SynthesisStep::Kind::RX, angles.phi + angles.lambda);
         break;
+      case EulerBasis::R:
+        appendRStep(angles.phi + angles.lambda, 0.0);
+        break;
       case EulerBasis::U:
         steps.emplace_back(SynthesisStep::Kind::U, 0.0, angles.phi,
                            angles.lambda);
@@ -324,6 +320,14 @@ struct Unitary1QEulerPlan {
       appendRotation(SynthesisStep::Kind::RX, angles.phi);
       phase = angles.phase;
       break;
+    case EulerBasis::R:
+      // X-Y-X with `Rx(a) == R(a, 0)` and `Ry(a) == R(a, pi/2)`.
+      appendRStep(angles.lambda, 0.0);
+      steps.emplace_back(SynthesisStep::Kind::R, angles.theta,
+                         std::numbers::pi / 2.0);
+      appendRStep(angles.phi, 0.0);
+      phase = angles.phase;
+      break;
     case EulerBasis::U:
       steps.emplace_back(SynthesisStep::Kind::U, angles.theta, angles.phi,
                          angles.lambda);
@@ -413,6 +417,9 @@ emitUnitary1QEulerPlan(OpBuilder& builder, Location loc, Value qubit,
       qubit =
           UOp::create(builder, loc, qubit, theta, phi, lambda).getQubitOut();
       break;
+    case SynthesisStep::Kind::R:
+      qubit = ROp::create(builder, loc, qubit, theta, phi).getQubitOut();
+      break;
     }
   }
   emitGPhaseIfNeeded(builder, loc, plan.phase);
@@ -427,6 +434,7 @@ std::optional<EulerBasis> parseEulerBasis(StringRef basis) {
       .Case("xyx", EulerBasis::XYX)
       .Case("u", EulerBasis::U)
       .Case("zsxx", EulerBasis::ZSXX)
+      .Case("r", EulerBasis::R)
       .Default(std::nullopt);
 }
 
diff --git a/mlir/lib/Dialect/QCO/Transforms/Decomposition/Helpers.cpp b/mlir/lib/Dialect/QCO/Transforms/Decomposition/Helpers.cpp
new file mode 100644
index 0000000000..9a449f3902
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Transforms/Decomposition/Helpers.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Helpers.h"
+
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/Support/ErrorHandling.h>
+
+#include <cmath>
+#include <complex>
+
+namespace mlir::qco::helpers {
+
+bool isUnitaryMatrix(const Matrix2x2& matrix, double tolerance) {
+  return (matrix.adjoint() * matrix).isIdentity(tolerance);
+}
+
+double remEuclid(double a, double b) {
+  if (b == 0.0) {
+    llvm::reportFatalInternalError("remEuclid expects non-zero divisor");
+  }
+  // Keep the result in [0, |b|): `r + |b|` can round up to exactly |b|.
+  const auto r = std::fmod(a, b);
+  const auto shifted = (r < 0.0) ? r + std::abs(b) : r;
+  return (shifted >= std::abs(b)) ? 0.0 : shifted;
+}
+
+double traceToFidelity(const std::complex<double>& x) {
+  // Average two-qubit process fidelity `(d + |x|^2) / (d * (d + 1))` for
+  // `d = 4`, with `x = tr(U_target^dag * U_actual)` (Hilbert-Schmidt overlap).
+  auto xAbs = std::abs(x);
+  return (4.0 + (xAbs * xAbs)) / 20.0;
+}
+
+std::complex<double> globalPhaseFactor(double globalPhase) {
+  return std::exp(std::complex<double>{0, 1} * globalPhase);
+}
+
+} // namespace mlir::qco::helpers
diff --git a/mlir/lib/Dialect/QCO/Transforms/Decomposition/UnitaryMatrices.cpp b/mlir/lib/Dialect/QCO/Transforms/Decomposition/UnitaryMatrices.cpp
new file mode 100644
index 0000000000..4b2f88137b
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Transforms/Decomposition/UnitaryMatrices.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/Transforms/Decomposition/UnitaryMatrices.h"
+
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/ErrorHandling.h>
+
+#include <cassert>
+#include <cmath>
+#include <complex>
+
+namespace mlir::qco::decomposition {
+
+Matrix2x2 uMatrix(double theta, double phi, double lambda) {
+  const auto c = std::cos(theta / 2.);
+  const auto s = std::sin(theta / 2.);
+  const Complex topRight{-std::cos(lambda) * s, -std::sin(lambda) * s};
+  const Complex bottomLeft{std::cos(phi) * s, std::sin(phi) * s};
+  const Complex bottomRight{std::cos(lambda + phi) * c,
+                            std::sin(lambda + phi) * c};
+  return Matrix2x2::fromElements(Complex{c, 0.}, topRight, bottomLeft,
+                                 bottomRight);
+}
+
+Matrix2x2 rxMatrix(double theta) {
+  const auto half = theta / 2.;
+  const Complex diag{std::cos(half), 0.};
+  const Complex offDiag{0., -std::sin(half)}; // -i * sin(theta / 2)
+  return Matrix2x2::fromElements(diag, offDiag, offDiag, diag);
+}
+
+Matrix2x2 ryMatrix(double theta) {
+  const auto half = theta / 2.;
+  const Complex c{std::cos(half), 0.};
+  const Complex s{std::sin(half), 0.};
+  return Matrix2x2::fromElements(c, -s, s, c);
+}
+
+Matrix2x2 rzMatrix(double theta) {
+  const auto c = std::cos(theta / 2.);
+  const auto s = std::sin(theta / 2.);
+  return Matrix2x2::fromElements(Complex{c, -s}, 0., 0., Complex{c, s});
+}
+
+Matrix4x4 rxxMatrix(double theta) {
+  const auto c = std::cos(theta / 2.);
+  const Complex mis{0., -std::sin(theta / 2.)}; // -i * sin(theta / 2)
+  return Matrix4x4::fromElements(c, 0, 0, mis, //
+                                 0, c, mis, 0, //
+                                 0, mis, c, 0, //
+                                 mis, 0, 0, c);
+}
+
+Matrix4x4 ryyMatrix(double theta) {
+  const auto c = std::cos(theta / 2.);
+  const Complex pis{0., std::sin(theta / 2.)};  // +i * sin(theta / 2)
+  const Complex mis{0., -std::sin(theta / 2.)}; // -i * sin(theta / 2)
+  return Matrix4x4::fromElements(c, 0, 0, pis, //
+                                 0, c, mis, 0, //
+                                 0, mis, c, 0, //
+                                 pis, 0, 0, c);
+}
+
+Matrix4x4 rzzMatrix(double theta) {
+  const auto c = std::cos(theta / 2.);
+  const auto s = std::sin(theta / 2.);
+  const Complex phaseMinus{c, -s}; // cos(theta / 2) - i * sin(theta / 2)
+  const Complex phasePlus{c, s};   // cos(theta / 2) + i * sin(theta / 2)
+  return Matrix4x4::fromElements(phaseMinus, 0, 0, 0, //
+                                 0, phasePlus, 0, 0,  //
+                                 0, 0, phasePlus, 0,  //
+                                 0, 0, 0, phaseMinus);
+}
+
+Matrix2x2 pMatrix(double lambda) {
+  const Complex phase{std::cos(lambda), std::sin(lambda)};
+  return Matrix2x2::fromElements(1., 0., 0., phase);
+}
+
+const Matrix4x4& swapGate() {
+  static const Matrix4x4 swap = Matrix4x4::fromElements(1, 0, 0, 0, // row 0
+                                                        0, 0, 1, 0, // row 1
+                                                        0, 1, 0, 0, // row 2
+                                                        0, 0, 0, 1); // row 3
+  return swap;
+}
+
+const Matrix2x2& hGate() {
+  static const Matrix2x2 hadamard = Matrix2x2::fromElements(
+      FRAC1_SQRT2, FRAC1_SQRT2, FRAC1_SQRT2, -FRAC1_SQRT2);
+  return hadamard;
+}
+
+const Matrix2x2& ipz() {
+  static const Matrix2x2 iTimesZ =
+      Matrix2x2::fromElements(Complex{0, 1}, 0, 0, Complex{0, -1});
+  return iTimesZ;
+}
+
+const Matrix2x2& ipy() {
+  static const Matrix2x2 iTimesY = Matrix2x2::fromElements(0, 1, -1, 0);
+  return iTimesY;
+}
+
+const Matrix2x2& ipx() {
+  static const Matrix2x2 iTimesX =
+      Matrix2x2::fromElements(0, Complex{0, 1}, Complex{0, 1}, 0);
+  return iTimesX;
+}
+
+const Matrix4x4& cxGate01() {
+  static const Matrix4x4 cx01 = Matrix4x4::fromElements(1, 0, 0, 0, // row 0
+                                                        0, 1, 0, 0, // row 1
+                                                        0, 0, 0, 1, // row 2
+                                                        0, 0, 1, 0); // row 3
+  return cx01;
+}
+
+const Matrix4x4& cxGate10() {
+  static const Matrix4x4 cx10 = Matrix4x4::fromElements(1, 0, 0, 0, // row 0
+                                                        0, 0, 0, 1, // row 1
+                                                        0, 0, 1, 0, // row 2
+                                                        0, 1, 0, 0); // row 3
+  return cx10;
+}
+
+const Matrix4x4& czGate() {
+  static const Matrix4x4 cz = Matrix4x4::fromElements(1, 0, 0, 0, // row 0
+                                                      0, 1, 0, 0, // row 1
+                                                      0, 0, 1, 0, // row 2
+                                                      0, 0, 0, -1); // row 3
+  return cz;
+}
+
+Matrix4x4 expandToTwoQubits(const Matrix2x2& singleQubitMatrix,
+                            QubitId qubitId) {
+  const bool onFirst = qubitId == 0;
+  if (!onFirst && !(qubitId == 1)) {
+    llvm::reportFatalInternalError(
+        "Invalid qubit id for single-qubit expansion");
+  }
+  return onFirst ? kron(singleQubitMatrix, Matrix2x2::identity())
+                 : kron(Matrix2x2::identity(), singleQubitMatrix);
+}
+
+Matrix4x4
+fixTwoQubitMatrixQubitOrder(const Matrix4x4& twoQubitMatrix,
+                            const llvm::SmallVector<QubitId, 2>& qubitIds) {
+  if (qubitIds == llvm::SmallVector<QubitId, 2>{0, 1}) {
+    return twoQubitMatrix;
+  }
+  if (qubitIds == llvm::SmallVector<QubitId, 2>{1, 0}) {
+    // `UnitaryOpInterface::getUnitaryMatrix4x4` uses a fixed index order, so
+    // reversed operands (1, 0) are handled by conjugating with SWAP.
+    return swapGate() * twoQubitMatrix * swapGate();
+  }
+  llvm::reportFatalInternalError(
+      "Invalid qubit IDs for fixing two-qubit matrix");
+}
+
+} // namespace mlir::qco::decomposition
diff --git a/mlir/lib/Dialect/QCO/Transforms/Decomposition/WeylDecomposition.cpp b/mlir/lib/Dialect/QCO/Transforms/Decomposition/WeylDecomposition.cpp
new file mode 100644
index 0000000000..4a1c5798e9
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Transforms/Decomposition/WeylDecomposition.cpp
@@ -0,0 +1,710 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/Transforms/Decomposition/WeylDecomposition.h"
+
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Euler.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Helpers.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/UnitaryMatrices.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/Support/ErrorHandling.h>
+#include <llvm/Support/FormatVariadic.h>
+#include <mlir/Support/LLVM.h>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <numbers>
+#include <optional>
+#include <random>
+#include <tuple>
+#include <utility>
+
+namespace mlir::qco::decomposition {
+
+using namespace std::complex_literals;
+
+TwoQubitWeylDecomposition
+TwoQubitWeylDecomposition::create(const Matrix4x4& unitaryMatrix,
+                                  std::optional<double> fidelity) {
+  auto u = unitaryMatrix; // working copy; rescaled into SU(4) below
+  auto detU = u.determinant();
+  // Project into SU(4) by dividing out the fourth root of det(U): for a 4x4
+  // unitary, |det(U)| == 1 so `det^{-1/4}` both enforces det == 1 and removes
+  // the global phase. The extracted phase is tracked separately in
+  // `globalPhase` (quarter of arg(det) to match the fourth-root choice) so the
+  // caller can reconstruct the original matrix exactly if needed.
+  auto detPow = std::pow(detU, -0.25);
+  u *= detPow; // remove global phase from unitary matrix
+  auto globalPhase = std::arg(detU) / 4.;
+
+  // Numerical drift can leave tiny determinant errors; re-normalize once
+  // more. NOTE(review): this rescale is not added to globalPhase - confirm.
+  auto detNormalized = u.determinant();
+  if (std::abs(detNormalized - Complex{1.0, 0.0}) > SANITY_CHECK_PRECISION &&
+      std::abs(detNormalized) > SANITY_CHECK_PRECISION) {
+    u *= std::pow(detNormalized, -0.25);
+  }
+
+  // transform unitary matrix to magic basis; this enables two properties:
+  // 1. if uP ∈ SO(4), V = A ⊗ B (SO(4) → SU(2) ⊗ SU(2))
+  // 2. magic basis diagonalizes canonical gate, allowing calculation of
+  //    canonical gate parameters later on
+  auto uP = magicBasisTransform(u, MagicBasisTransform::OutOf);
+  const Matrix4x4 m2 = uP.transpose() * uP;
+
+  // diagonalization yields eigenvectors (p) and eigenvalues (d);
+  // p is used to calculate K1/K2 (and thus the single-qubit gates
+  // surrounding the canonical gate); d is used to determine the Weyl
+  // coordinates and thus the parameters of the canonical gate
+  auto [p, d] = diagonalizeComplexSymmetric(m2, DIAGONALIZATION_PRECISION);
+
+  // extract Weyl coordinates from the eigenvalue phases, map to [0, 2*pi)
+  constexpr double pi = std::numbers::pi;
+  std::array<double, 4> dReal{};
+  for (std::size_t i = 0; i < d.size(); ++i) {
+    dReal[i] = -std::arg(d[i]) / 2.0;
+  }
+  dReal[3] = -dReal[0] - dReal[1] - dReal[2]; // forces the four to sum to 0
+  std::array<double, 3> cs{};
+  for (std::size_t i = 0; i < cs.size(); ++i) {
+    cs[i] = helpers::remEuclid((dReal[i] + dReal[3]) / 2.0, 2.0 * pi);
+  }
+
+  // Reorder coordinates according to min(a, pi/2 - a) with
+  // a = x mod pi/2 for each Weyl coordinate x
+  std::array<double, 3> cstemp{};
+  for (std::size_t i = 0; i < cs.size(); ++i) {
+    const auto tmp = helpers::remEuclid(cs[i], pi / 2.0);
+    cstemp[i] = std::min(tmp, (pi / 2.0) - tmp);
+  }
+  std::array<std::size_t, 3> order{0, 1, 2};
+  std::stable_sort(order.begin(), order.end(),
+                   [&](auto a, auto b) { return cstemp[a] < cstemp[b]; });
+  order = {order[1], order[2], order[0]}; // rotate: smallest key moves last
+  cs = {cs[order[0]], cs[order[1]], cs[order[2]]};
+  {
+    const std::array<double, 3> reordered{dReal[order[0]], dReal[order[1]],
+                                          dReal[order[2]]};
+    dReal[0] = reordered[0];
+    dReal[1] = reordered[1];
+    dReal[2] = reordered[2];
+  }
+
+  // update eigenvectors (columns of p) according to new order of
+  // weyl coordinates; column 3 keeps its position
+  const Matrix4x4 pOrig = p;
+  for (std::size_t i = 0; i < order.size(); ++i) {
+    p.setColumn(i, pOrig.column(order[i]));
+  }
+  // negate the last eigenvector if needed so that det(p) is positive
+  if (p.determinant().real() < 0.0) {
+    auto lastColumn = p.column(3);
+    for (auto& entry : lastColumn) {
+      entry = -entry;
+    }
+    p.setColumn(3, lastColumn);
+  }
+  assert(std::abs(p.determinant() - 1.0) < SANITY_CHECK_PRECISION);
+
+  // re-create complex eigenvalue matrix; this matrix contains the
+  // parameters of the canonical gate which is later used in the
+  // verification. Since the matrix is diagonal, the matrix exponential is
+  // equivalent to the element-wise exponential function.
+  std::array<Complex, 4> tempDiag{};
+  for (std::size_t k = 0; k < tempDiag.size(); ++k) {
+    tempDiag[k] = std::exp(1i * dReal[k]);
+  }
+  const Matrix4x4 temp = Matrix4x4::fromDiagonal(tempDiag);
+
+  // combined matrix k1 of 1q gates after canonical gate
+  Matrix4x4 k1 = uP * p * temp;
+  // k1 must be orthogonal; the tolerance matches the iterative diagonalization
+  // residual rather than the (much tighter) default matrix tolerance.
+  assert((k1.transpose() * k1).isIdentity(SANITY_CHECK_PRECISION));
+  assert(k1.determinant().real() > 0.0);
+  k1 = magicBasisTransform(k1, MagicBasisTransform::Into);
+
+  // combined matrix k2 of 1q gates before canonical gate
+  Matrix4x4 k2 = p.adjoint();
+  // k2 must be orthogonal; see the tolerance note on the k1 check above.
+  assert((k2.transpose() * k2).isIdentity(SANITY_CHECK_PRECISION));
+  assert(k2.determinant().real() > 0.0);
+  k2 = magicBasisTransform(k2, MagicBasisTransform::Into);
+
+  // ensure k1 and k2 are correct (when combined with the canonical gate
+  // parameters in-between, they are equivalent to u)
+  std::array<Complex, 4> tempConjDiag{};
+  for (std::size_t k = 0; k < tempConjDiag.size(); ++k) {
+    tempConjDiag[k] = std::conj(tempDiag[k]);
+  }
+  assert((k1 *
+          magicBasisTransform(Matrix4x4::fromDiagonal(tempConjDiag),
+                              MagicBasisTransform::Into) *
+          k2)
+             .isApprox(u, SANITY_CHECK_PRECISION));
+
+  // calculate k1 = K1l ⊗ K1r
+  auto [K1l, K1r, phaseL] = decomposeTwoQubitProductGate(k1);
+  // decompose k2 = K2l ⊗ K2r
+  auto [K2l, K2r, phaseR] = decomposeTwoQubitProductGate(k2);
+  assert(kron(K1l, K1r).isApprox(k1, SANITY_CHECK_PRECISION));
+  assert(kron(K2l, K2r).isApprox(k2, SANITY_CHECK_PRECISION));
+  // accumulate global phase
+  globalPhase += phaseL + phaseR;
+
+  // Flip into Weyl chamber; each step adjusts the 1q factors and the phase
+  if (cs[0] > (pi / 2.0)) {
+    cs[0] -= 3.0 * (pi / 2.0);
+    K1l = K1l * ipy();
+    K1r = K1r * ipy();
+    globalPhase += (pi / 2.0);
+  }
+  if (cs[1] > (pi / 2.0)) {
+    cs[1] -= 3.0 * (pi / 2.0);
+    K1l = K1l * ipx();
+    K1r = K1r * ipx();
+    globalPhase += (pi / 2.0);
+  }
+  auto conjs = 0; // counts the reflections applied to cs[0] and cs[1]
+  if (cs[0] > (pi / 4.0)) {
+    cs[0] = (pi / 2.0) - cs[0];
+    K1l = K1l * ipy();
+    K2r = ipy() * K2r;
+    conjs += 1;
+    globalPhase -= (pi / 2.0);
+  }
+  if (cs[1] > (pi / 4.0)) {
+    cs[1] = (pi / 2.0) - cs[1];
+    K1l = K1l * ipx();
+    K2r = ipx() * K2r;
+    conjs += 1;
+    globalPhase += (pi / 2.0);
+    if (conjs == 1) {
+      globalPhase -= pi;
+    }
+  }
+  if (cs[2] > (pi / 2.0)) {
+    cs[2] -= 3.0 * (pi / 2.0);
+    K1l = K1l * ipz();
+    K1r = K1r * ipz();
+    globalPhase += (pi / 2.0);
+    if (conjs == 1) {
+      globalPhase -= pi;
+    }
+  }
+  if (conjs == 1) {
+    cs[2] = (pi / 2.0) - cs[2];
+    K1l = K1l * ipz();
+    K2r = ipz() * K2r;
+    globalPhase += (pi / 2.0);
+  }
+  if (cs[2] > (pi / 4.0)) {
+    cs[2] -= (pi / 2.0);
+    K1l = K1l * ipz();
+    K1r = K1r * ipz();
+    globalPhase -= (pi / 2.0);
+  }
+
+  // bind Weyl coordinates as canonical-gate parameters (a = cs[1], b = cs[0])
+  auto [a, b, c] = std::tie(cs[1], cs[0], cs[2]); // references into cs
+
+  TwoQubitWeylDecomposition decomposition;
+  decomposition.a_ = a;
+  decomposition.b_ = b;
+  decomposition.c_ = c;
+  decomposition.globalPhase_ = globalPhase;
+  decomposition.k1l_ = K1l;
+  decomposition.k2l_ = K2l;
+  decomposition.k1r_ = K1r;
+  decomposition.k2r_ = K2r;
+  decomposition.specialization = Specialization::General;
+  decomposition.requestedFidelity = fidelity;
+
+  // make sure decomposition is equal to input
+  assert((kron(K1l, K1r) * decomposition.getCanonicalMatrix() * kron(K2l, K2r) *
+          helpers::globalPhaseFactor(globalPhase))
+             .isApprox(unitaryMatrix, SANITY_CHECK_PRECISION));
+
+  // determine actual specialization of canonical gate so that the 1q
+  // matrices can potentially be simplified
+  auto flippedFromOriginal = decomposition.applySpecialization();
+
+  auto getTrace = [&]() {
+    if (flippedFromOriginal) {
+      return TwoQubitWeylDecomposition::getTrace(
+          (pi / 2.0) - a, b, -c, decomposition.a_, decomposition.b_,
+          decomposition.c_);
+    }
+    return TwoQubitWeylDecomposition::getTrace(
+        a, b, c, decomposition.a_, decomposition.b_, decomposition.c_);
+  };
+  // use trace to calculate fidelity of applied specialization and
+  // adjust global phase
+  auto trace = getTrace();
+  const double calculatedFidelity = helpers::traceToFidelity(trace);
+  // final check if specialization is close enough to the original matrix to
+  // satisfy the requested fidelity; since no forced specialization is
+  // allowed, this should never fail
+  if (decomposition.requestedFidelity &&
+      calculatedFidelity + 1.0e-13 < *decomposition.requestedFidelity) {
+    llvm::reportFatalInternalError(llvm::formatv(
+        "TwoQubitWeylDecomposition: Calculated fidelity of "
+        "specialization is worse than requested fidelity ({0:F4} vs {1:F4})!",
+        calculatedFidelity, *decomposition.requestedFidelity));
+  }
+  decomposition.globalPhase_ += std::arg(trace);
+
+  // final check if decomposition is still valid after specialization
+  assert((kron(decomposition.k1l_, decomposition.k1r_) *
+          decomposition.getCanonicalMatrix() *
+          kron(decomposition.k2l_, decomposition.k2r_) *
+          helpers::globalPhaseFactor(decomposition.globalPhase_))
+             .isApprox(unitaryMatrix, SANITY_CHECK_PRECISION));
+
+  return decomposition;
+}
+
+Matrix4x4 TwoQubitWeylDecomposition::getCanonicalMatrix(double a, double b,
+                                                        double c) {
+  // Builds `U_d(a, b, c) = exp(i * (a*XX + b*YY + c*ZZ))` as a product of
+  // two-qubit rotations. Because XX, YY and ZZ commute pairwise, the factor
+  // order does not matter mathematically; ZZ * YY * XX follows the common
+  // Qiskit/QuantumFlow references.
+  // The `RXX/RYY/RZZ` convention is `exp(-i * theta/2 * XX)`, so every
+  // coordinate is passed as `-2 * coordinate` to end up with `+a`, `+b` and
+  // `+c` in the exponent.
+  const auto rotZZ = rzzMatrix(-2.0 * c);
+  const auto rotYY = ryyMatrix(-2.0 * b);
+  return rotZZ * rotYY * rxxMatrix(-2.0 * a);
+}
+
+Matrix4x4
+TwoQubitWeylDecomposition::magicBasisTransform(const Matrix4x4& unitary,
+                                               MagicBasisTransform direction) {
+  // Change of basis to/from the Makhlin "magic basis". Conjugating a
+  // 2-qubit unitary by `magic` maps SU(2) x SU(2) factors onto SO(4) and
+  // diagonalizes the canonical (Weyl) gate. `magic` is stored unnormalized;
+  // the `1/2` pre-factor that would normally appear in `B^dagger` is folded
+  // into `magicDagger` directly, so `magicDagger * magic == I` holds
+  // without an extra scalar.
+  const Matrix4x4 magic = Matrix4x4::fromElements( //
+      1, 1i, 0, 0,                                 //
+      0, 0, 1i, 1,                                 //
+      0, 0, 1i, -1,                                //
+      1, -1i, 0, 0);
+  const Matrix4x4 magicDagger = Matrix4x4::fromElements( //
+      0.5, 0, 0, 0.5,                                    //
+      Complex{0.0, -0.5}, 0, 0, Complex{0.0, 0.5},       //
+      0, Complex{0.0, -0.5}, Complex{0.0, -0.5}, 0,      //
+      0, 0.5, -0.5, 0);
+  switch (direction) {
+  case MagicBasisTransform::OutOf:
+    return magicDagger * unitary * magic;
+  case MagicBasisTransform::Into:
+    return magic * unitary * magicDagger;
+  }
+  llvm::reportFatalInternalError("Unknown MagicBasisTransform direction!");
+}
+
+double TwoQubitWeylDecomposition::closestPartialSwap(double a, double b,
+                                                     double c) {
+  const double mean = (a + b + c) / 3.;
+  const auto sq = [](double x) { return x * x; };
+  const double spread = 6. + sq(a - b) + sq(b - c) + sq(c - a);
+  return mean + ((a - mean) * (b - mean) * (c - mean) * spread / 18.);
+}
+
+std::pair<Matrix4x4, std::array<Complex, 4>>
+TwoQubitWeylDecomposition::diagonalizeComplexSymmetric(const Matrix4x4& m,
+                                                       double precision) {
+  // We can't use raw `eig` directly because it isn't guaranteed to give
+  // us real or orthogonal eigenvectors. Instead, since `M` is
+  // complex-symmetric,
+  //   M = A + iB
+  // for real-symmetric `A` and `B`, and as
+  //   M^+ @ M = A^2 + B^2 + i [A, B] = 1
+  // we must have `A` and `B` commute, and consequently they are
+  // simultaneously diagonalizable. Mixing them together _should_ account
+  // for any degeneracy problems, but it's not guaranteed, so we repeat it
+  // a little bit.  The fixed seed is to make failures deterministic; the
+  // value is not important.
+  auto state = std::mt19937{2023};
+  std::normal_distribution<double> dist;
+
+  const auto mReal = m.realPart();
+  const auto mImag = m.imagPart();
+
+  double bestErr = 1e300; // smallest max-abs reconstruction error seen so far
+  constexpr auto maxDiagonalizationAttempts = 100;
+  for (int i = 0; i < maxDiagonalizationAttempts; ++i) {
+    double randA{};
+    double randB{};
+    // For debugging the algorithm use the same RNG values as the
+    // Qiskit implementation for the first random trial.
+    // In most cases this loop only executes a single iteration and
+    // using the same rng values rules out possible RNG differences
+    // as the root cause of a test failure
+    if (i == 0) {
+      randA = 1.2602066112249388;
+      randB = 0.22317849046722027;
+    } else {
+      randA = dist(state);
+      randB = dist(state);
+    }
+    std::array<double, 16> m2Real{}; // real mix randA * Re(m) + randB * Im(m)
+    for (std::size_t k = 0; k < m2Real.size(); ++k) {
+      m2Real[k] = (randA * mReal[k]) + (randB * mImag[k]);
+    }
+    const Matrix4x4 p = jacobiSymmetricEigen(m2Real).eigenvectors;
+    const std::array<Complex, 4> d = (p.transpose() * m * p).diagonal();
+
+    const auto compare = p * Matrix4x4::fromDiagonal(d) * p.transpose();
+    {
+      double err = 0.0; // only feeds the diagnostic in the failure message
+      for (std::size_t r = 0; r < 4; ++r) {
+        for (std::size_t cc = 0; cc < 4; ++cc) {
+          err = std::max(err, std::abs(compare(r, cc) - m(r, cc)));
+        }
+      }
+      bestErr = std::min(bestErr, err);
+    }
+    if (compare.isApprox(m, precision)) {
+      // p are the eigenvectors which are decomposed into the
+      // single-qubit gates surrounding the canonical gate
+      // d is the diagonal of p^T * m * p; its phases are used to determine
+      // the Weyl coordinates and thus the parameters of the canonical gate
+      // check that p is orthogonal (p^T * p == I)
+      assert((p.transpose() * p).isIdentity(SANITY_CHECK_PRECISION));
+      // make sure determinant of eigenvalues is 1.0
+      assert(std::abs(Matrix4x4::fromDiagonal(d).determinant() - 1.0) <
+             SANITY_CHECK_PRECISION);
+      return std::make_pair(p, d);
+    }
+  }
+  llvm::reportFatalInternalError(llvm::formatv(
+      "TwoQubitWeylDecomposition: failed to diagonalize M2 ({0} iterations). "
+      "best error = {1:e}, precision = {2:e}",
+      maxDiagonalizationAttempts, bestErr, precision));
+}
+
+std::tuple<Matrix2x2, Matrix2x2, double>
+TwoQubitWeylDecomposition::decomposeTwoQubitProductGate(
+    const Matrix4x4& specialUnitary) {
+  // Factor a product gate into `left (x) right`, each rescaled to unit
+  // determinant, and report half the phase of the left factor's determinant.
+  // For alternative approaches, see pennylane's
+  // math.decomposition.su2su2_to_tensor_products or
+  // quantumflow.kronecker_decomposition.
+
+  // 2x2 block spanning rows (row, row + 1) and columns (0, 1)
+  const auto blockAt = [&specialUnitary](std::size_t row) {
+    return Matrix2x2::fromElements(
+        specialUnitary(row, 0), specialUnitary(row, 1),
+        specialUnitary(row + 1, 0), specialUnitary(row + 1, 1));
+  };
+
+  // right factor: first quadrant, or the third one if |det| is below 0.1
+  Matrix2x2 right = blockAt(0);
+  auto rightDet = right.determinant();
+  if (std::abs(rightDet) < 0.1) {
+    right = blockAt(2);
+    rightDet = right.determinant();
+  }
+  if (std::abs(rightDet) < 0.1) {
+    llvm::reportFatalInternalError(
+        "decomposeTwoQubitProductGate: unable to decompose: det_r < 0.1");
+  }
+  right *= (1.0 / std::sqrt(rightDet));
+
+  // strip the right factor, then sample entries (0,0), (0,2), (2,0), (2,2)
+  const Matrix4x4 stripped =
+      specialUnitary * kron(Matrix2x2::identity(), right.adjoint());
+  Matrix2x2 left = Matrix2x2::fromElements(stripped(0, 0), stripped(0, 2),
+                                           stripped(2, 0), stripped(2, 2));
+  const auto leftDet = left.determinant();
+  if (std::abs(leftDet) < 0.9) {
+    llvm::reportFatalInternalError(
+        "decomposeTwoQubitProductGate: unable to decompose: detL < 0.9");
+  }
+  left *= (1.0 / std::sqrt(leftDet));
+  const double phase = std::arg(leftDet) / 2.;
+
+  return {left, right, phase};
+}
+
+std::complex<double> TwoQubitWeylDecomposition::getTrace(double a, double b,
+                                                         double c, double ap,
+                                                         double bp, double cp) {
+  // Closed-form Hilbert-Schmidt overlap `tr(U_d(a,b,c)^dag * U_d(ap,bp,cp))`
+  // of two canonical (Weyl) gates. It depends only on the coordinate
+  // differences: the real part is 4 * cos * cos * cos and the imaginary
+  // part is 4 * sin * sin * sin of those differences.
+  // `bestSpecialization` feeds the result into `traceToFidelity` to decide
+  // whether a candidate specialization is close enough.
+  // Reference: Zhang et al., "Geometric theory of nonlocal two-qubit
+  // operations", Phys. Rev. A 67, 042313 (2003), Eq. (20).
+  const std::array<double, 3> diff{a - ap, b - bp, c - cp};
+  const double re = std::cos(diff[0]) * std::cos(diff[1]) * std::cos(diff[2]);
+  const double im = std::sin(diff[0]) * std::sin(diff[1]) * std::sin(diff[2]);
+  return 4. * std::complex<double>{re, im};
+}
+
+TwoQubitWeylDecomposition::Specialization
+TwoQubitWeylDecomposition::bestSpecialization() const {
+  // Returns the first specialization in the table below whose canonical
+  // coordinates reproduce (a_, b_, c_) with at least the requested fidelity.
+  // Without a requested fidelity no candidate qualifies, so the result is
+  // `General`.
+  constexpr double quarterPi = std::numbers::pi / 4.0;
+  const double swapPow = closestPartialSwap(a_, b_, c_);
+  const double flipPow = closestPartialSwap(a_, b_, -c_);
+  const double abMean = (a_ + b_) / 2.;
+  const double bcMean = (b_ + c_) / 2.;
+
+  struct Candidate {
+    Specialization kind;
+    double a;
+    double b;
+    double c;
+  };
+  // Order matters: the first matching row wins.
+  const std::array<Candidate, 10> candidates{{
+      {Specialization::IdEquiv, 0., 0., 0.},
+      {Specialization::SWAPEquiv, quarterPi, quarterPi, quarterPi},
+      {Specialization::SWAPEquiv, quarterPi, quarterPi, -quarterPi},
+      {Specialization::PartialSWAPEquiv, swapPow, swapPow, swapPow},
+      {Specialization::PartialSWAPFlipEquiv, flipPow, flipPow, -flipPow},
+      {Specialization::ControlledEquiv, a_, 0., 0.},
+      {Specialization::MirrorControlledEquiv, quarterPi, quarterPi, c_},
+      {Specialization::FSimaabEquiv, abMean, abMean, c_},
+      {Specialization::FSimabbEquiv, a_, bcMean, bcMean},
+      {Specialization::FSimabmbEquiv, a_, (b_ - c_) / 2., (c_ - b_) / 2.},
+  }};
+
+  if (!requestedFidelity) {
+    return Specialization::General;
+  }
+  for (const auto& candidate : candidates) {
+    const auto tr =
+        getTrace(a_, b_, c_, candidate.a, candidate.b, candidate.c);
+    if (helpers::traceToFidelity(tr) >= *requestedFidelity) {
+      return candidate.kind;
+    }
+  }
+  return Specialization::General;
+}
+
+bool TwoQubitWeylDecomposition::applySpecialization() {
+  if (specialization != Specialization::General) {
+    llvm::reportFatalInternalError(
+        "Application of specialization only works on "
+        "general Weyl decompositions!");
+  }
+  bool flippedFromOriginal = false; // only set in the SWAPEquiv branch
+  auto newSpecialization = bestSpecialization();
+  if (newSpecialization == Specialization::General) {
+    // U has no special symmetry.
+    //
+    // This gate binds all 6 possible parameters, so there is no need to
+    // make the single-qubit pre-/post-gates canonical.
+    return flippedFromOriginal;
+  }
+  specialization = newSpecialization;
+
+  if (newSpecialization == Specialization::IdEquiv) {
+    // :math:`U \sim U_d(0,0,0)`
+    // Thus, :math:`\sim Id`
+    //
+    // This gate binds 0 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Id` , :math:`K2_r = Id`.
+    a_ = 0.;
+    b_ = 0.;
+    c_ = 0.;
+    // unmodified global phase
+    k1l_ = k1l_ * k2l_;
+    k2l_ = Matrix2x2::identity();
+    k1r_ = k1r_ * k2r_;
+    k2r_ = Matrix2x2::identity();
+  } else if (newSpecialization == Specialization::SWAPEquiv) {
+    // :math:`U \sim U_d(\pi/4, \pi/4, \pi/4) \sim U_d(\pi/4, \pi/4, -\pi/4)`
+    // Thus, :math:`U \sim \text{SWAP}`
+    //
+    // This gate binds 0 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Id` , :math:`K2_r = Id`.
+    if (c_ > 0.) {
+      // unmodified global phase
+      k1l_ = k1l_ * k2r_;
+      k1r_ = k1r_ * k2l_;
+      k2l_ = Matrix2x2::identity();
+      k2r_ = Matrix2x2::identity();
+    } else {
+      flippedFromOriginal = true;
+
+      globalPhase_ += (std::numbers::pi / 2.0);
+      k1l_ = k1l_ * ipz() * k2r_;
+      k1r_ = k1r_ * ipz() * k2l_;
+      k2l_ = Matrix2x2::identity();
+      k2r_ = Matrix2x2::identity();
+    }
+    a_ = (std::numbers::pi / 4.0);
+    b_ = (std::numbers::pi / 4.0);
+    c_ = (std::numbers::pi / 4.0);
+  } else if (newSpecialization == Specialization::PartialSWAPEquiv) {
+    // :math:`U \sim U_d(\alpha\pi/4, \alpha\pi/4, \alpha\pi/4)`
+    // Thus, :math:`U \sim \text{SWAP}^\alpha`
+    //
+    // This gate binds 3 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Id`.
+    auto closest = closestPartialSwap(a_, b_, c_);
+    auto k2lDagger = k2l_.adjoint();
+
+    a_ = closest;
+    b_ = closest;
+    c_ = closest;
+    // unmodified global phase
+    k1l_ = k1l_ * k2l_;
+    k1r_ = k1r_ * k2l_;
+    k2r_ = k2lDagger * k2r_;
+    k2l_ = Matrix2x2::identity();
+  } else if (newSpecialization == Specialization::PartialSWAPFlipEquiv) {
+    // :math:`U \sim U_d(\alpha\pi/4, \alpha\pi/4, -\alpha\pi/4)`
+    // Thus, :math:`U \sim \text{SWAP}^\alpha`
+    //
+    // (a non-equivalent root of SWAP from the PartialSWAPEquiv
+    // specialization, similar to how :math:`x = (\pm \sqrt(x))^2`)
+    //
+    // This gate binds 3 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Id`
+    auto closest = closestPartialSwap(a_, b_, -c_);
+    auto k2lDagger = k2l_.adjoint();
+
+    a_ = closest;
+    b_ = closest;
+    c_ = -closest;
+    // unmodified global phase
+    k1l_ = k1l_ * k2l_;
+    k1r_ = k1r_ * ipz() * k2l_ * ipz();
+    k2r_ = ipz() * k2lDagger * ipz() * k2r_;
+    k2l_ = Matrix2x2::identity();
+  } else if (newSpecialization == Specialization::ControlledEquiv) {
+    // :math:`U \sim U_d(\alpha, 0, 0)`
+    // Thus, :math:`U \sim \text{Ctrl-U}`
+    //
+    // This gate binds 4 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Ry(\theta_l) Rx(\lambda_l)`
+    // :math:`K2_r = Ry(\theta_r) Rx(\lambda_r)`
+    const EulerBasis eulerBasis = EulerBasis::XYX;
+    const auto [k2ltheta, k2lphi, k2llambda, k2lphase] =
+        anglesFromUnitary(k2l_, eulerBasis);
+    const auto [k2rtheta, k2rphi, k2rlambda, k2rphase] =
+        anglesFromUnitary(k2r_, EulerBasis::XYX);
+    // unmodified parameter a
+    b_ = 0.;
+    c_ = 0.;
+    globalPhase_ = globalPhase_ + k2lphase + k2rphase;
+    k1l_ = k1l_ * rxMatrix(k2lphi);
+    k2l_ = ryMatrix(k2ltheta) * rxMatrix(k2llambda);
+    k1r_ = k1r_ * rxMatrix(k2rphi);
+    k2r_ = ryMatrix(k2rtheta) * rxMatrix(k2rlambda);
+  } else if (newSpecialization == Specialization::MirrorControlledEquiv) {
+    // :math:`U \sim U_d(\pi/4, \pi/4, \alpha)`
+    // Thus, :math:`U \sim \text{SWAP} \cdot \text{Ctrl-U}`
+    //
+    // This gate binds 4 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Ry(\theta_l)\cdot Rz(\lambda_l)`
+    // :math:`K2_r = Ry(\theta_r)\cdot Rz(\lambda_r)`
+    const auto [k2ltheta, k2lphi, k2llambda, k2lphase] =
+        anglesFromUnitary(k2l_, EulerBasis::ZYZ);
+    const auto [k2rtheta, k2rphi, k2rlambda, k2rphase] =
+        anglesFromUnitary(k2r_, EulerBasis::ZYZ);
+    a_ = (std::numbers::pi / 4.0);
+    b_ = (std::numbers::pi / 4.0);
+    // unmodified parameter c
+    globalPhase_ = globalPhase_ + k2lphase + k2rphase;
+    k1l_ = k1l_ * rzMatrix(k2rphi); // left/right phi angles cross over here
+    k2l_ = ryMatrix(k2ltheta) * rzMatrix(k2llambda);
+    k1r_ = k1r_ * rzMatrix(k2lphi);
+    k2r_ = ryMatrix(k2rtheta) * rzMatrix(k2rlambda);
+  } else if (newSpecialization == Specialization::FSimaabEquiv) {
+    // :math:`U \sim U_d(\alpha, \alpha, \beta), \alpha \geq |\beta|`
+    //
+    // This gate binds 5 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Ry(\theta_l) \cdot Rz(\lambda_l)`.
+    auto [k2ltheta, k2lphi, k2llambda, k2lphase] =
+        anglesFromUnitary(k2l_, EulerBasis::ZYZ);
+    auto ab = (a_ + b_) / 2.;
+
+    a_ = ab;
+    b_ = ab;
+    // unmodified parameter c
+    globalPhase_ = globalPhase_ + k2lphase;
+    k1l_ = k1l_ * rzMatrix(k2lphi);
+    k2l_ = ryMatrix(k2ltheta) * rzMatrix(k2llambda);
+    k1r_ = k1r_ * rzMatrix(k2lphi);
+    k2r_ = rzMatrix(-k2lphi) * k2r_;
+  } else if (newSpecialization == Specialization::FSimabbEquiv) {
+    // :math:`U \sim U_d(\alpha, \beta, \beta), \alpha \geq \beta \geq 0`
+    //
+    // This gate binds 5 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Ry(\theta_l) \cdot Rx(\lambda_l)`
+    auto eulerBasis = EulerBasis::XYX;
+    auto [k2ltheta, k2lphi, k2llambda, k2lphase] =
+        anglesFromUnitary(k2l_, eulerBasis);
+    auto bc = (b_ + c_) / 2.;
+
+    // unmodified parameter a
+    b_ = bc;
+    c_ = bc;
+    globalPhase_ = globalPhase_ + k2lphase;
+    k1l_ = k1l_ * rxMatrix(k2lphi);
+    k2l_ = ryMatrix(k2ltheta) * rxMatrix(k2llambda);
+    k1r_ = k1r_ * rxMatrix(k2lphi);
+    k2r_ = rxMatrix(-k2lphi) * k2r_;
+  } else if (newSpecialization == Specialization::FSimabmbEquiv) {
+    // :math:`U \sim U_d(\alpha, \beta, -\beta), \alpha \geq \beta \geq 0`
+    //
+    // This gate binds 5 parameters, we make it canonical by setting:
+    //
+    // :math:`K2_l = Ry(\theta_l) \cdot Rx(\lambda_l)`
+    auto eulerBasis = EulerBasis::XYX;
+    auto [k2ltheta, k2lphi, k2llambda, k2lphase] =
+        anglesFromUnitary(k2l_, eulerBasis);
+    auto bc = (b_ - c_) / 2.;
+
+    // unmodified parameter a
+    b_ = bc;
+    c_ = -bc;
+    globalPhase_ = globalPhase_ + k2lphase;
+    k1l_ = k1l_ * rxMatrix(k2lphi);
+    k2l_ = ryMatrix(k2ltheta) * rxMatrix(k2llambda);
+    k1r_ = k1r_ * ipz() * rxMatrix(k2lphi) * ipz();
+    k2r_ = ipz() * rxMatrix(-k2lphi) * ipz() * k2r_;
+  } else {
+    llvm::reportFatalInternalError(
+        "Unknown specialization for Weyl decomposition!");
+  }
+  return flippedFromOriginal;
+}
+
+} // namespace mlir::qco::decomposition
diff --git a/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/FuseTwoQubitUnitaryRuns.cpp b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/FuseTwoQubitUnitaryRuns.cpp
new file mode 100644
index 0000000000..6ddbfb9580
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/FuseTwoQubitUnitaryRuns.cpp
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/FuseTwoQubitUnitaryRuns.h"
+
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
+#include "mlir/Dialect/QCO/IR/QCOOps.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/UnitaryMatrices.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/NativeSpec.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Policy.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/TwoQubit.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Utils.h"
+#include "mlir/Dialect/QCO/Transforms/Passes.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/STLExtras.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/Support/Casting.h>
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/PatternMatch.h>
+#include <mlir/IR/Value.h>
+#include <mlir/Support/LogicalResult.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+#include <utility>
+#include <vector>
+
+namespace mlir::qco {
+
+#define GEN_PASS_DEF_FUSETWOQUBITUNITARYRUNS
+#include "mlir/Dialect/QCO/Transforms/Passes.h.inc"
+
+} // namespace mlir::qco
+
+namespace mlir::qco::native_synth {
+
+namespace {
+
+/// State for one maximal two-qubit window (plus absorbed one-qubit ops)
+/// during consolidation.
+struct TwoQubitBlock {
+  Value wireA; ///< Newest value on the window's first wire (its output side).
+  Value wireB; ///< Newest value on the window's second wire (its output side).
+  llvm::SmallVector<Operation*, 8> ops; ///< Absorbed ops, in scan order.
+  Matrix4x4 accum = Matrix4x4::identity(); ///< Product; later ops go on the left.
+  unsigned numTwoQ = 0; ///< Number of two-qubit ops in `ops`.
+  unsigned numOneQ = 0; ///< Number of one-qubit ops in `ops`.
+  bool anyNonNative = false; ///< Set once any absorbed op is off the menu.
+  bool open = true; ///< Cleared by `closeBlock`.
+};
+
+/// Tracks overlapping two-qubit windows on a module slice.
+struct TwoQubitWindowConsolidator {
+  std::vector<TwoQubitBlock> blocks; ///< Every window recorded, open or closed.
+  llvm::DenseMap<Value, size_t> wireToBlock; ///< Tracked wire -> `blocks` index.
+
+  void closeBlock(size_t idx); ///< Close `blocks[idx]` and untrack its wires.
+  void closeBlockOnWire(Value v); ///< Close the window tracking `v`, if any.
+  void process(Operation* op, const NativeProfileSpec& spec); ///< Scan one op.
+  LogicalResult materialize(IRRewriter& rewriter, // rewrites worthwhile windows
+                            const NativeProfileSpec& spec);
+};
+
+/// Report whether the two-qubit op `op` is already on the resolved native
+/// menu. A `CtrlOp` with one control and one target qualifies when its body
+/// is `X` (CX entangler active) or `Z` (CZ entangler active); a bare `RZZOp`
+/// qualifies when `spec.allowRzz` is set. Everything else does not.
+bool isNativeTwoQubitOp(Operation* op, const NativeProfileSpec& spec) {
+  auto ctrl = llvm::dyn_cast<CtrlOp>(op);
+  if (!ctrl) {
+    return spec.allowRzz && llvm::isa<RZZOp>(op);
+  }
+  const bool oneControlOneTarget =
+      ctrl.getNumControls() == 1 && ctrl.getNumTargets() == 1;
+  if (!oneControlOneTarget) {
+    return false;
+  }
+  Operation* inner = ctrl.getBodyUnitary(0).getOperation();
+  if (llvm::isa<XOp>(inner)) {
+    return usesCxEntangler(spec);
+  }
+  return llvm::isa<ZOp>(inner) && usesCzEntangler(spec);
+}
+
+/// Profitability test for rewriting a consolidated window. A window holding
+/// at least one off-menu op is always rewritten; a fully native window is
+/// rewritten only when `numBasisUses` is strictly below the number of
+/// two-qubit ops the window already contains (`block.numTwoQ`).
+bool shouldApplyBlockReplacement(const TwoQubitBlock& block,
+                                 std::uint8_t numBasisUses) {
+  // Off-menu content has to be lowered regardless of the entangler cost.
+  const bool mustLower = block.anyNonNative;
+  const bool savesEntanglers = numBasisUses < block.numTwoQ;
+  return mustLower || savesEntanglers;
+}
+
+/// Emit the native sequence for `block.accum` before the window's first op
+/// (fed by that op's two input qubits), redirect uses of the window outputs to
+/// the new values, and erase the old ops last-to-first; failure if emit fails.
+LogicalResult materializeSingleTwoQubitBlock(IRRewriter& rewriter,
+                                             const TwoQubitBlock& block,
+                                             const NativeProfileSpec& spec) {
+  Operation* head = block.ops.front();
+  auto headUnitary = llvm::cast<UnitaryOpInterface>(head);
+  rewriter.setInsertionPoint(head);
+  Value fusedA;
+  Value fusedB;
+  if (failed(emitTwoQubitNative(rewriter, head->getLoc(),
+                                headUnitary.getInputQubit(0),
+                                headUnitary.getInputQubit(1), block.accum, spec,
+                                fusedA, fusedB))) {
+    head->emitError("failed to emit synthesized two-qubit gate sequence");
+    return failure();
+  }
+  rewriter.replaceAllUsesWith(block.wireA, fusedA);
+  rewriter.replaceAllUsesWith(block.wireB, fusedB);
+  for (Operation* stale : llvm::reverse(block.ops)) {
+    rewriter.eraseOp(stale);
+  }
+  return success();
+}
+
+/// Close window `idx` (idempotent) and stop tracking its two output wires.
+void TwoQubitWindowConsolidator::closeBlock(size_t idx) {
+  TwoQubitBlock& window = blocks[idx];
+  if (std::exchange(window.open, false)) {
+    wireToBlock.erase(window.wireA);
+    if (window.wireA != window.wireB) {
+      wireToBlock.erase(window.wireB);
+    }
+  }
+}
+
+/// Close the window that currently tracks wire `v`, if there is one.
+void TwoQubitWindowConsolidator::closeBlockOnWire(Value v) {
+  if (const auto hit = wireToBlock.find(v); hit != wireToBlock.end()) {
+    closeBlock(hit->second);
+  }
+}
+
+void TwoQubitWindowConsolidator::process(Operation* op, // one scan step
+                                         const NativeProfileSpec& spec) {
+  if (op->getParentOfType<CtrlOp>()) { // nested in a `ctrl` body: ignored
+    return;
+  }
+  if (!llvm::isa<InvOp>(op) && op->getParentOfType<InvOp>()) { // `inv` body
+    return;
+  }
+  auto unitary = llvm::dyn_cast<UnitaryOpInterface>(op);
+  if (!unitary) { // non-unitary ops are ignored and close nothing here
+    return;
+  }
+  if (llvm::isa<BarrierOp, GPhaseOp>(op)) { // never absorbed into a window
+    for (Value v : op->getOperands()) {
+      closeBlockOnWire(v); // ends any window on the operand wires
+    }
+    return;
+  }
+
+  if (unitary.isTwoQubit()) {
+    Matrix4x4 opMatrix;
+    if (!getBlockTwoQubitMatrix(op, opMatrix)) { // helper failed: cut both wires
+      closeBlockOnWire(unitary.getInputQubit(0));
+      closeBlockOnWire(unitary.getInputQubit(1));
+      return;
+    }
+    const Value v0 = unitary.getInputQubit(0);
+    const Value v1 = unitary.getInputQubit(1);
+    if (v0 == v1) { // same value on both inputs: not a usable window op
+      closeBlockOnWire(v0);
+      return;
+    }
+    auto it0 = wireToBlock.find(v0);
+    auto it1 = wireToBlock.find(v1);
+    const bool tracked0 = it0 != wireToBlock.end();
+    const bool tracked1 = it1 != wireToBlock.end();
+    const std::optional<size_t> idx0 =
+        tracked0 ? std::optional(it0->second) : std::nullopt;
+    const std::optional<size_t> idx1 =
+        tracked1 ? std::optional(it1->second) : std::nullopt;
+    const bool sameBlock =
+        idx0.has_value() && idx1.has_value() && *idx0 == *idx1;
+    const bool singleUse = v0.hasOneUse() && v1.hasOneUse();
+
+    if (sameBlock && singleUse) { // both inputs continue one window: extend it
+      const size_t idx = *idx0;
+      auto& block = blocks[idx];
+      llvm::SmallVector<decomposition::QubitId, 2> ids; // window slot per input
+      if (v0 == block.wireA && v1 == block.wireB) {
+        ids = {0, 1};
+      } else if (v0 == block.wireB && v1 == block.wireA) {
+        ids = {1, 0}; // op acts on the window wires in swapped order
+      } else {
+        closeBlock(idx);
+        return;
+      }
+      block.accum = decomposition::fixTwoQubitMatrixQubitOrder(opMatrix, ids) *
+                    block.accum; // the newer op multiplies from the left
+      block.ops.push_back(op); // NOTE(review): no same-block/region check vs ops.front() — confirm
+      ++block.numTwoQ;
+      if (!isNativeTwoQubitOp(op, spec)) {
+        block.anyNonNative = true;
+      }
+      const Value eraseKeyA = it0->first; // copy the keys before erasing
+      const Value eraseKeyB = it1->first;
+      wireToBlock.erase(eraseKeyA);
+      if (eraseKeyA != eraseKeyB) {
+        wireToBlock.erase(eraseKeyB);
+      }
+      Value newA;
+      Value newB;
+      if (v0 == block.wireA) { // map the op outputs back onto window slots A/B
+        newA = unitary.getOutputQubit(0);
+        newB = unitary.getOutputQubit(1);
+      } else {
+        newA = unitary.getOutputQubit(1);
+        newB = unitary.getOutputQubit(0);
+      }
+      block.wireA = newA;
+      block.wireB = newB;
+      wireToBlock[newA] = idx; // track the window by its new output wires
+      wireToBlock[newB] = idx;
+      return;
+    }
+
+    if (idx0.has_value()) { // otherwise: end the windows feeding this op ...
+      closeBlock(*idx0);
+    }
+    if (idx1.has_value() && (!idx0.has_value() || *idx0 != *idx1)) {
+      closeBlock(*idx1);
+    }
+    TwoQubitBlock nb; // ... and open a new window seeded with this op
+    nb.wireA = unitary.getOutputQubit(0);
+    nb.wireB = unitary.getOutputQubit(1);
+    nb.ops.push_back(op);
+    nb.numTwoQ = 1;
+    nb.accum = opMatrix;
+    nb.anyNonNative = !isNativeTwoQubitOp(op, spec);
+    const size_t idx = blocks.size();
+    blocks.push_back(std::move(nb));
+    wireToBlock[blocks[idx].wireA] = idx;
+    wireToBlock[blocks[idx].wireB] = idx;
+    return;
+  }
+
+  if (unitary.isSingleQubit()) {
+    const Value v = unitary.getInputQubit(0);
+    auto it = wireToBlock.find(v);
+    if (it == wireToBlock.end()) { // wire belongs to no open window: ignore
+      return;
+    }
+    const size_t idx = it->second;
+    auto& block = blocks[idx];
+    Matrix2x2 raw;
+    if (!unitary.getUnitaryMatrix2x2(raw) || !v.hasOneUse()) { // cannot absorb
+      closeBlock(idx);
+      return;
+    }
+    const auto pad = (v == block.wireA) // lift to 4x4 on the matching slot
+                         ? decomposition::expandToTwoQubits(raw, 0)
+                         : decomposition::expandToTwoQubits(raw, 1);
+    block.accum = pad * block.accum;
+    block.ops.push_back(op);
+    ++block.numOneQ;
+    if (!allowsSingleQubitOp(unitary, spec)) {
+      block.anyNonNative = true;
+    }
+    wireToBlock.erase(it);
+    if (v == block.wireA) { // advance the touched wire to the op's output
+      block.wireA = unitary.getOutputQubit(0);
+      wireToBlock[block.wireA] = idx;
+    } else {
+      block.wireB = unitary.getOutputQubit(0);
+      wireToBlock[block.wireB] = idx;
+    }
+    return;
+  }
+
+  for (Value v : op->getOperands()) { // any other arity ends windows it touches
+    closeBlockOnWire(v);
+  }
+}
+
+/// Rewrite every recorded window that is worth replacing. A window is skipped
+/// when it has fewer than two ops, shares an op with an already rewritten
+/// window, has no entangler count, or is not profitable to replace.
+LogicalResult
+TwoQubitWindowConsolidator::materialize(IRRewriter& rewriter,
+                                        const NativeProfileSpec& spec) {
+  llvm::DenseSet<Operation*> gone;
+  const auto alreadyErased = [&gone](Operation* candidate) {
+    return gone.contains(candidate);
+  };
+  for (const TwoQubitBlock& window : blocks) {
+    // A lone op is left as is; erased ops must never be touched again.
+    if (window.ops.size() < 2 || llvm::any_of(window.ops, alreadyErased)) {
+      continue;
+    }
+    const auto entanglers = twoQubitEntanglerCount(window.accum, spec);
+    // Skip windows without a count or without a payoff.
+    if (!entanglers || !shouldApplyBlockReplacement(window, *entanglers)) {
+      continue;
+    }
+    if (failed(materializeSingleTwoQubitBlock(rewriter, window, spec))) {
+      return failure();
+    }
+    // Remember the erased ops so later windows can be checked against them.
+    gone.insert(window.ops.begin(), window.ops.end());
+  }
+  return success();
+}
+
+} // namespace
+
+/// Collect ops under `root` with `collectUnitaryOpsInPreOrder`, feed them in
+/// that order to a window consolidator, then rewrite the worthwhile windows.
+LogicalResult fuseTwoQubitUnitaryRuns(IRRewriter& rewriter, Operation* root,
+                                      const NativeProfileSpec& spec) {
+  llvm::SmallVector<Operation*, 32> worklist;
+  collectUnitaryOpsInPreOrder(root, worklist);
+  TwoQubitWindowConsolidator windows;
+  llvm::for_each(worklist, [&](Operation* op) { windows.process(op, spec); });
+  return windows.materialize(rewriter, spec);
+}
+
+namespace {
+
+/// Pass wrapper that runs `fuseTwoQubitUnitaryRuns` on the pass's root op.
+struct FuseTwoQubitUnitaryRunsPass final
+    : impl::FuseTwoQubitUnitaryRunsBase<FuseTwoQubitUnitaryRunsPass> {
+  using Base::Base;
+
+  explicit FuseTwoQubitUnitaryRunsPass(FuseTwoQubitUnitaryRunsOptions options)
+      : Base(std::move(options)) {}
+
+protected:
+  void runOnOperation() override {
+    if (llvm::StringRef(nativeGates).trim().empty()) {
+      return;
+    }
+    const auto spec = resolveNativeGatesSpec(nativeGates);
+    if (spec) {
+      IRRewriter rewriter(&getContext());
+      if (failed(fuseTwoQubitUnitaryRuns(rewriter, getOperation(), *spec))) {
+        signalPassFailure();
+      }
+      return;
+    }
+    getOperation().emitError() << "unsupported native gate menu (native-gates='"
+                               << nativeGates << "')";
+    signalPassFailure();
+  }
+};
+
+} // namespace
+
+} // namespace mlir::qco::native_synth
diff --git a/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/NativeSpec.cpp b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/NativeSpec.cpp
new file mode 100644
index 0000000000..74c1e271d2
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/NativeSpec.cpp
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/NativeSpec.h"
+
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Euler.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Types.h"
+
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/STLExtras.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/StringSwitch.h>
+#include <llvm/Support/ErrorHandling.h>
+
+#include <optional>
+
+namespace mlir::qco::native_synth {
+
+/// Map one lower-case, whitespace-free menu token to its `NativeGateKind`.
+static std::optional<NativeGateKind> parseGateToken(llvm::StringRef name) {
+  using Kind = NativeGateKind;
+  return llvm::StringSwitch<std::optional<Kind>>(name)
+      .Case("u", Kind::U)
+      .Case("x", Kind::X)
+      .Case("sx", Kind::Sx)
+      .Cases("rz", "p", Kind::Rz)
+      .Case("rx", Kind::Rx)
+      .Case("ry", Kind::Ry)
+      .Case("r", Kind::R)
+      .Case("cx", Kind::Cx)
+      .Case("cz", Kind::Cz)
+      .Case("rzz", Kind::Rzz)
+      .Default(std::nullopt);
+}
+
+/// Turn a comma-separated menu such as `"u,cx,rzz"` into the set of gate kinds
+/// it names (tokens trimmed and lower-cased); `nullopt` on an unknown token.
+static std::optional<llvm::DenseSet<NativeGateKind>>
+parseGateSet(llvm::StringRef nativeGates) {
+  llvm::SmallVector<llvm::StringRef> pieces;
+  nativeGates.split(pieces, ',', /*MaxSplit=*/-1, /*KeepEmpty=*/false);
+  llvm::DenseSet<NativeGateKind> kinds;
+  for (const llvm::StringRef piece : pieces) {
+    const auto normalized = piece.trim().lower();
+    if (normalized.empty()) {
+      continue;
+    }
+    if (const auto kind = parseGateToken(normalized)) {
+      kinds.insert(*kind);
+    } else {
+      return std::nullopt;
+    }
+  }
+  return kinds;
+}
+
+/// Bundle `mode`, `axisPair` and `supportsDirectRx` into an emitter spec.
+static SingleQubitEmitterSpec
+makeEmitterSpec(SingleQubitMode mode, AxisPair axisPair = AxisPair::RxRz,
+                bool supportsDirectRx = false) {
+  return SingleQubitEmitterSpec{
+      .mode = mode, .axisPair = axisPair, .supportsDirectRx = supportsDirectRx};
+}
+
+/// Push an emitter with the given `(mode, axisPair, supportsDirectRx)` onto
+/// `emitters`, unless an entry with the same three fields is already there.
+static void
+addEmitterIfAbsent(llvm::SmallVectorImpl<SingleQubitEmitterSpec>& emitters,
+                   SingleQubitMode mode, AxisPair axisPair = AxisPair::RxRz,
+                   bool supportsDirectRx = false) {
+  for (const SingleQubitEmitterSpec& existing : emitters) {
+    if (existing.mode == mode && existing.axisPair == axisPair &&
+        existing.supportsDirectRx == supportsDirectRx) {
+      return;
+    }
+  }
+  emitters.push_back(makeEmitterSpec(mode, axisPair, supportsDirectRx));
+}
+
+/// List the native gate kinds that `emitter` is able to produce.
+static llvm::SmallVector<NativeGateKind, 4>
+allowedGatesForEmitter(const SingleQubitEmitterSpec& emitter) {
+  using Kind = NativeGateKind;
+  if (emitter.mode == SingleQubitMode::U3) {
+    return {Kind::U};
+  }
+  if (emitter.mode == SingleQubitMode::R) {
+    return {Kind::R};
+  }
+  if (emitter.mode == SingleQubitMode::ZSXX) {
+    llvm::SmallVector<NativeGateKind, 4> gates{Kind::X, Kind::Sx, Kind::Rz};
+    if (emitter.supportsDirectRx) {
+      gates.push_back(Kind::Rx);
+    }
+    return gates;
+  }
+  if (emitter.mode == SingleQubitMode::AxisPair) {
+    switch (emitter.axisPair) {
+    case AxisPair::RxRz:
+      return {Kind::Rx, Kind::Rz};
+    case AxisPair::RxRy:
+      return {Kind::Rx, Kind::Ry};
+    case AxisPair::RyRz:
+      return {Kind::Ry, Kind::Rz};
+    }
+  }
+  llvm_unreachable("unknown single-qubit mode");
+}
+
+/// List the gate kinds behind `entangler` (empty for `EntanglerBasis::None`).
+static llvm::SmallVector<NativeGateKind, 2>
+allowedGatesForEntangler(EntanglerBasis entangler) {
+  switch (entangler) {
+  case EntanglerBasis::Cz:
+    return {NativeGateKind::Cz};
+  case EntanglerBasis::Cx:
+    return {NativeGateKind::Cx};
+  case EntanglerBasis::None:
+    return {};
+  }
+  llvm_unreachable("unknown entangler basis");
+}
+
+/// Rebuild `spec.allowedGates` from its emitters, entangler bases and `rzz`.
+static void populateAllowedGates(NativeProfileSpec& spec) {
+  spec.allowedGates.clear();
+  const auto addAll = [&spec](const auto& kinds) {
+    spec.allowedGates.insert(kinds.begin(), kinds.end());
+  };
+  for (const auto& emitter : spec.singleQubitEmitters) {
+    addAll(allowedGatesForEmitter(emitter));
+  }
+  for (const auto basis : spec.entanglerBases) {
+    addAll(allowedGatesForEntangler(basis));
+  }
+  if (spec.allowRzz) {
+    spec.allowedGates.insert(NativeGateKind::Rzz);
+  }
+}
+
+/// Euler basis for `axisPair`: RxRz -> XZX, RxRy -> XYX, RyRz -> ZYZ.
+static decomposition::EulerBasis eulerBasisForAxisPair(AxisPair axisPair) {
+  switch (axisPair) {
+  case AxisPair::RyRz:
+    return decomposition::EulerBasis::ZYZ;
+  case AxisPair::RxRy:
+    return decomposition::EulerBasis::XYX;
+  case AxisPair::RxRz:
+    return decomposition::EulerBasis::XZX;
+  }
+  llvm_unreachable("unknown axis pair");
+}
+
+/// Euler basis that corresponds to `emitter`'s single-qubit mode.
+decomposition::EulerBasis
+emitterEulerBasis(const SingleQubitEmitterSpec& emitter) {
+  switch (emitter.mode) {
+  case SingleQubitMode::AxisPair:
+    return eulerBasisForAxisPair(emitter.axisPair);
+  case SingleQubitMode::U3:
+    return decomposition::EulerBasis::U;
+  case SingleQubitMode::ZSXX:
+    return decomposition::EulerBasis::ZSXX;
+  case SingleQubitMode::R:
+    // X-Y-X chain emitted directly as native R(theta, phi) gates, using
+    // `Rx(a) == R(a, 0)` and `Ry(a) == R(a, pi/2)`.
+    return decomposition::EulerBasis::R;
+  }
+  llvm_unreachable("unknown single-qubit mode");
+}
+
+/// Resolve a comma-separated native-gate menu into a `NativeProfileSpec`.
+/// Single-qubit emitters are appended in a fixed order (U3, ZSXX, R, then the
+/// RxRz/RxRy/RyRz axis pairs), followed by the entangler bases and `rzz`.
+/// Returns `std::nullopt` when the menu does not parse, names no gate, or
+/// admits no single-qubit emitter.
+std::optional<NativeProfileSpec>
+resolveNativeGatesSpec(llvm::StringRef nativeGates) {
+  const auto menu = parseGateSet(nativeGates);
+  if (!menu || menu->empty()) {
+    return std::nullopt;
+  }
+  // Membership test on the parsed menu.
+  const auto onMenu = [&menu](NativeGateKind kind) {
+    return menu->contains(kind);
+  };
+
+  NativeProfileSpec spec;
+  auto& emitters = spec.singleQubitEmitters;
+
+  // An emitter mode is added only when *every* gate kind it emits is on the
+  // menu. ZSXX, for example, needs X, Sx and Rz together because its
+  // decomposer emits all three unconditionally. `Rx` is not a ZSXX
+  // requirement: when it is also listed, `supportsDirectRx` merely enables a
+  // fast path for `Rx(theta)` inputs.
+  if (onMenu(NativeGateKind::U)) {
+    addEmitterIfAbsent(emitters, SingleQubitMode::U3);
+  }
+  if (onMenu(NativeGateKind::X) && onMenu(NativeGateKind::Sx) &&
+      onMenu(NativeGateKind::Rz)) {
+    addEmitterIfAbsent(emitters, SingleQubitMode::ZSXX, AxisPair::RxRz,
+                       /*supportsDirectRx=*/onMenu(NativeGateKind::Rx));
+  }
+  if (onMenu(NativeGateKind::R)) {
+    addEmitterIfAbsent(emitters, SingleQubitMode::R);
+  }
+
+  // Two-axis emitters: one per rotation pair with both axes on the menu.
+  const auto addAxisPair = [&](AxisPair axis, NativeGateKind first,
+                               NativeGateKind second) {
+    if (onMenu(first) && onMenu(second)) {
+      addEmitterIfAbsent(emitters, SingleQubitMode::AxisPair, axis);
+    }
+  };
+  addAxisPair(AxisPair::RxRz, NativeGateKind::Rx, NativeGateKind::Rz);
+  addAxisPair(AxisPair::RxRy, NativeGateKind::Rx, NativeGateKind::Ry);
+  addAxisPair(AxisPair::RyRz, NativeGateKind::Ry, NativeGateKind::Rz);
+
+  // A menu that admits no single-qubit emitter is rejected.
+  if (emitters.empty()) {
+    return std::nullopt;
+  }
+
+  // Entangler bases keep the order `cx`, then `cz`.
+  if (onMenu(NativeGateKind::Cx)) {
+    spec.entanglerBases.push_back(EntanglerBasis::Cx);
+  }
+  if (onMenu(NativeGateKind::Cz)) {
+    spec.entanglerBases.push_back(EntanglerBasis::Cz);
+  }
+  // `rzz` is recorded as a separate flag, not as an entangler basis.
+  spec.allowRzz = onMenu(NativeGateKind::Rzz);
+
+  // Derive the flat allowed-gate set from everything resolved above.
+  populateAllowedGates(spec);
+  return spec;
+}
+
+} // namespace mlir::qco::native_synth
diff --git a/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/Pass.cpp b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/Pass.cpp
new file mode 100644
index 0000000000..7ab87c95c1
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/Pass.cpp
@@ -0,0 +1,598 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/IR/QCODialect.h"
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
+#include "mlir/Dialect/QCO/IR/QCOOps.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Euler.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/FuseTwoQubitUnitaryRuns.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/NativeSpec.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Policy.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/SingleQubit.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/TwoQubit.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Utils.h"
+#include "mlir/Dialect/QCO/Transforms/Passes.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/ADT/STLExtras.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/Casting.h>
+#include <mlir/IR/Location.h>
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/PatternMatch.h>
+#include <mlir/IR/Value.h>
+#include <mlir/Pass/Pass.h>
+#include <mlir/Support/LLVM.h>
+#include <mlir/Support/LogicalResult.h>
+#include <mlir/Support/WalkResult.h>
+
+#include <cstddef>
+#include <memory>
+#include <optional>
+
+namespace mlir::qco {
+#define GEN_PASS_DEF_NATIVEGATESYNTHESISPASS
+#include "mlir/Dialect/QCO/Transforms/Passes.h.inc"
+} // namespace mlir::qco
+
+namespace mlir::qco {
+
+using native_synth::allowsSingleQubitOp;
+using native_synth::canDirectlyDecomposeToAxisPair;
+using native_synth::canDirectlyDecomposeToR;
+using native_synth::canDirectlyDecomposeToU3;
+using native_synth::canDirectlyDecomposeToZSXX;
+using native_synth::collectUnitaryOpsInPreOrder;
+using native_synth::decomposeToAxisPair;
+using native_synth::decomposeToR;
+using native_synth::decomposeToU3;
+using native_synth::decomposeToZSXX;
+using native_synth::emitSingleQubitMatrix;
+using native_synth::emitterEulerBasis;
+using native_synth::emitTwoQubitNative;
+using native_synth::fuseTwoQubitUnitaryRuns;
+using native_synth::getBlockTwoQubitMatrix;
+using native_synth::NativeGateKind;
+using native_synth::NativeProfileSpec;
+using native_synth::resolveNativeGatesSpec;
+using native_synth::rewriteXXPlusMinusYYViaRzz;
+using native_synth::SingleQubitEmitterSpec;
+using native_synth::SingleQubitMode;
+using native_synth::usesCxEntangler;
+using native_synth::usesCzEntangler;
+
+namespace {
+
+/// Adjacent single-qubit unitaries on one wire considered for fusion.
+struct OneQubitRun {
+  llvm::SmallVector<UnitaryOpInterface, 4> ops; ///< Run members, first to last.
+};
+
+} // namespace
+
+/// If profitable, replace the run with one synthesized single-qubit op in
+/// `basis` (mirrors `FuseSingleQubitUnitaryRuns`). Fuses when any op is
+/// off-menu or when Euler resynthesis strictly shortens the run. An empty run
+/// or a member without a constant matrix is left alone (returns false).
+static bool maybeFuseRun(IRRewriter& rewriter, OneQubitRun& run,
+                         const decomposition::EulerBasis basis,
+                         const NativeProfileSpec& spec) {
+  if (run.ops.empty()) {
+    return false; // `front()`/`back()` below require at least one member
+  }
+  Matrix2x2 fused = Matrix2x2::identity();
+  bool anyNonNative = false;
+  for (UnitaryOpInterface u : run.ops) {
+    Matrix2x2 m;
+    if (!u.getUnitaryMatrix2x2(m)) {
+      return false;
+    }
+    fused.premultiplyBy(m); // later ops multiply from the left
+    anyNonNative = anyNonNative || !allowsSingleQubitOp(u, spec);
+  }
+  Operation* firstOp = run.ops.front().getOperation();
+  const Value inQubit = run.ops.front().getInputQubit(0);
+  const Value outQubit = run.ops.back().getOutputQubit(0);
+  rewriter.setInsertionPoint(firstOp);
+  const auto replacement = decomposition::synthesizeUnitary1QEuler(
+      rewriter, firstOp->getLoc(), inQubit, fused, run.ops.size(), anyNonNative,
+      basis);
+  if (!replacement) {
+    return false;
+  }
+  rewriter.replaceAllUsesWith(outQubit, *replacement);
+  for (auto& op : llvm::reverse(run.ops)) {
+    rewriter.eraseOp(op.getOperation());
+  }
+  return true;
+}
+
+/// True when `op` sits inside a `ctrl` body, or inside an `inv` body without
+/// being an `inv` op itself; such ops are handled via the enclosing modifier.
+static bool isHiddenInsideCtrlOrInvBody(Operation* op) {
+  const bool insideCtrl = static_cast<bool>(op->getParentOfType<CtrlOp>());
+  if (insideCtrl) {
+    return true;
+  }
+  // `inv` ops themselves are exempt from the `inv`-parent rule.
+  const bool isInvShell = llvm::isa<InvOp>(op);
+  return !isInvShell && static_cast<bool>(op->getParentOfType<InvOp>());
+}
+
+/// Return `op` as a `UnitaryOpInterface` when it may join a fusion run, else a
+/// null interface. Eligible: a single-qubit unitary other than barrier, gphase
+/// or `ctrl`, outside `ctrl`/`inv` bodies, with a constant `2×2` matrix.
+static UnitaryOpInterface fusibleSingleQubitOp(Operation* op) {
+  auto candidate = llvm::dyn_cast<UnitaryOpInterface>(op);
+  const bool shapeOk = candidate && candidate.isSingleQubit() &&
+                       !llvm::isa<BarrierOp, GPhaseOp, CtrlOp>(op) &&
+                       !isHiddenInsideCtrlOrInvBody(op);
+  if (!shapeOk) {
+    return {};
+  }
+  // Only ops that can report a constant `2×2` matrix are fusible.
+  Matrix2x2 scratch;
+  if (candidate.getUnitaryMatrix2x2(scratch)) {
+    return candidate;
+  }
+  return {};
+}
+
+/// Ask the `canDirectlyDecomposeTo*` predicate matching `emitter.mode` whether
+/// `op` has a direct (non-matrix) lowering; false for an unknown mode.
+static bool emitterHasDirectLowering(Operation* op,
+                                     const SingleQubitEmitterSpec& emitter) {
+  switch (emitter.mode) {
+  case SingleQubitMode::AxisPair:
+    return canDirectlyDecomposeToAxisPair(op, emitter.axisPair);
+  case SingleQubitMode::R:
+    return canDirectlyDecomposeToR(op);
+  case SingleQubitMode::U3:
+    return canDirectlyDecomposeToU3(op);
+  case SingleQubitMode::ZSXX:
+    return canDirectlyDecomposeToZSXX(op, emitter.supportsDirectRx);
+  }
+  return false;
+}
+
+/// Run the `decomposeTo*` helper that matches `emitter.mode` on `op`, feeding
+/// it the input qubit `in`. Yields the helper's result: the output qubit
+/// value, or a null `Value` if no direct rule applies for this op.
+static Value
+applyDirectSingleQubitLowering(IRRewriter& rewriter, Operation* op, Value in,
+                               const SingleQubitEmitterSpec& emitter) {
+  switch (emitter.mode) {
+  case SingleQubitMode::AxisPair:
+    return decomposeToAxisPair(rewriter, op, in, emitter.axisPair);
+  case SingleQubitMode::R:
+    return decomposeToR(rewriter, op, in);
+  case SingleQubitMode::U3:
+    return decomposeToU3(rewriter, op, in);
+  case SingleQubitMode::ZSXX:
+    return decomposeToZSXX(rewriter, op, in, emitter.supportsDirectRx);
+  }
+  llvm_unreachable("unknown SingleQubitMode");
+}
+
+namespace {
+
+/// Lowers unitary QCO ops to a comma-separated native gate menu using a
+/// deterministic, matrix-driven synthesizer: single-qubit fuse, two-qubit
+/// window consolidation, synthesis sweeps, seam single-qubit fuse, and
+/// optional cleanup sweeps.
+struct NativeGateSynthesisPass
+    : impl::NativeGateSynthesisPassBase<NativeGateSynthesisPass> {
+  /// Default constructor; options keep their TableGen-generated defaults.
+  NativeGateSynthesisPass() = default;
+
+  /// Construct the pass from the TableGen-generated options struct; `options`
+  /// is handed unchanged to the generated base-class constructor.
+  explicit NativeGateSynthesisPass(
+      const NativeGateSynthesisPassOptions& options)
+      : NativeGateSynthesisPassBase(options) {}
+
+  /// Construct the pass from the public `NativeGateSynthesisOptions` struct
+  /// used by pipeline code that cannot include the TableGen-generated header.
+  explicit NativeGateSynthesisPass(const NativeGateSynthesisOptions& options) {
+    nativeGates = options.nativeGates; // the only option copied over here
+  }
+
+protected:
+  /// Top-level pass entry point. Resolves the native-gate menu, then drives
+  /// the staged rewrite pipeline: one-qubit run fusion, two-qubit window
+  /// consolidation, up to four synthesis sweeps until no off-menu single-qubit
+  /// op remains, seam fusion, and up to four full-menu cleanup sweeps. Fails
+  /// the pass on an unsupported menu, a failed stage, or non-convergence.
+  void runOnOperation() override {
+    // Empty (or all-whitespace) native-gates string: the pass is a no-op.
+    if (llvm::StringRef(nativeGates).trim().empty()) {
+      return;
+    }
+    auto specOpt = resolveNativeGatesSpec(nativeGates);
+    if (!specOpt) {
+      getOperation().emitError()
+          << "unsupported native gate menu (native-gates='" << nativeGates
+          << "')";
+      signalPassFailure();
+      return;
+    }
+    const auto& spec = *specOpt;
+    // Deterministic single-qubit basis: the first emitter drives all matrix
+    // synthesis and run fusion.
+    const decomposition::EulerBasis oneQubitBasis =
+        emitterEulerBasis(spec.singleQubitEmitters.front());
+
+    IRRewriter rewriter(&getContext());
+
+    fuseOneQubitRuns(rewriter, spec, oneQubitBasis);
+    if (failed(consolidateTwoQubitBlocks(rewriter, spec))) {
+      signalPassFailure();
+      return;
+    }
+    // Two-qubit lowering can emit off-menu single-qubit ops (e.g. `rx`/`ry`);
+    // sweep until none remain or the sweep cap is hit, before seam cleanup.
+    constexpr unsigned kMaxSynthesisSweeps = 4;
+    for (unsigned i = 0; i < kMaxSynthesisSweeps; ++i) {
+      if (failed(synthesizeRemainingOps(rewriter, spec, oneQubitBasis))) {
+        signalPassFailure();
+        return;
+      }
+      if (!hasNonNativeSingleQubitOps(spec)) {
+        break;
+      }
+    }
+    if (hasNonNativeSingleQubitOps(spec)) { // cap reached without converging
+      getOperation().emitError()
+          << "native gate synthesis did not converge within "
+          << kMaxSynthesisSweeps
+          << " sweeps (single-qubit ops remain outside the native menu)";
+      signalPassFailure();
+      return;
+    }
+    // Fuse single-qubit seams between two-qubit blocks.
+    fuseOneQubitRuns(rewriter, spec, oneQubitBasis);
+    // Re-check full menu (single-qubit ops, native `ctrl`, allowed bare `rzz`).
+    constexpr unsigned kPostMenuCleanupSweeps = 4;
+    unsigned postMenuSweepsRemaining = kPostMenuCleanupSweeps;
+    while (hasNonNativeMenuOps(spec) && postMenuSweepsRemaining-- > 0) {
+      if (failed(synthesizeRemainingOps(rewriter, spec, oneQubitBasis))) {
+        signalPassFailure();
+        return;
+      }
+      fuseOneQubitRuns(rewriter, spec, oneQubitBasis); // re-fuse after a sweep
+    }
+    if (hasNonNativeMenuOps(spec)) { // still off-menu after the cleanup budget
+      getOperation().emitError()
+          << "native gate synthesis: operations remain outside the native menu "
+             "after final cleanup";
+      signalPassFailure();
+      return;
+    }
+  }
+
+  /// A `CtrlOp` is on-menu when it has one control and one target and its body
+  /// is `X` with the `cx` entangler active, or `Z` with the `cz` entangler.
+  static bool ctrlMatchesNativeMenu(CtrlOp ctrl,
+                                    const NativeProfileSpec& spec) {
+    const bool oneControlOneTarget =
+        ctrl.getNumControls() == 1 && ctrl.getNumTargets() == 1;
+    if (!oneControlOneTarget) {
+      return false;
+    }
+    Operation* inner = ctrl.getBodyUnitary(0).getOperation();
+    if (llvm::isa<XOp>(inner)) {
+      return usesCxEntangler(spec);
+    }
+    return llvm::isa<ZOp>(inner) && usesCzEntangler(spec);
+  }
+
+  /// A bare two-qubit op is on-menu only if it is `rzz` and the profile has it.
+  static bool bareTwoQubitMatchesNativeMenu(Operation* op,
+                                            const NativeProfileSpec& spec) {
+    const bool rzzOnMenu = spec.allowedGates.contains(NativeGateKind::Rzz);
+    return llvm::isa<RZZOp>(op) && spec.allowRzz && rzzOnMenu;
+  }
+
+  /// Walks every op under the pass root and reports whether at least one is
+  /// off-menu for `spec`. Barriers, global phases, ops hidden inside a
+  /// `ctrl` / `inv` body and non-unitary ops are ignored. A `ctrl` must pass
+  /// `ctrlMatchesNativeMenu`, a single-qubit unitary `allowsSingleQubitOp`, and
+  /// a two-qubit unitary `bareTwoQubitMatchesNativeMenu`. A unitary that is
+  /// neither single- nor two-qubit always counts as off-menu.
+  bool hasNonNativeMenuOps(const NativeProfileSpec& spec) {
+    const auto isOnMenuOrIgnored = [&](Operation* op) -> bool {
+      if (llvm::isa<BarrierOp, GPhaseOp>(op)) {
+        return true;
+      }
+      if (isHiddenInsideCtrlOrInvBody(op)) {
+        return true;
+      }
+      // `ctrl` shells are judged as a whole, before the generic unitary path.
+      if (auto ctrl = llvm::dyn_cast<CtrlOp>(op)) {
+        return ctrlMatchesNativeMenu(ctrl, spec);
+      }
+      auto unitary = llvm::dyn_cast<UnitaryOpInterface>(op);
+      if (!unitary) {
+        return true;
+      }
+      if (unitary.isSingleQubit()) {
+        return allowsSingleQubitOp(unitary, spec);
+      }
+      return unitary.isTwoQubit() && bareTwoQubitMatchesNativeMenu(op, spec);
+    };
+    // The walk stops at the first offending op.
+    return getOperation()
+        ->walk([&](Operation* op) {
+          return isOnMenuOrIgnored(op) ? mlir::WalkResult::advance()
+                                       : mlir::WalkResult::interrupt();
+        })
+        .wasInterrupted();
+  }
+
+  /// Reports whether some single-qubit unitary under the pass root is rejected
+  /// by `allowsSingleQubitOp`. Barriers, global phases and ops hidden inside a
+  /// `ctrl` / `inv` body are not inspected; multi-qubit unitaries are ignored.
+  bool hasNonNativeSingleQubitOps(const NativeProfileSpec& spec) {
+    const auto isOffMenuSingleQubit = [&](Operation* op) -> bool {
+      if (llvm::isa<BarrierOp, GPhaseOp>(op)) {
+        return false;
+      }
+      if (isHiddenInsideCtrlOrInvBody(op)) {
+        return false;
+      }
+      auto unitary = llvm::dyn_cast<UnitaryOpInterface>(op);
+      if (!unitary || !unitary.isSingleQubit()) {
+        return false;
+      }
+      return !allowsSingleQubitOp(unitary, spec);
+    };
+    // Interrupt on the first offender; an interrupted walk means "found one".
+    return getOperation()
+        ->walk([&](Operation* op) {
+          return isOffMenuSingleQubit(op) ? mlir::WalkResult::interrupt()
+                                          : mlir::WalkResult::advance();
+        })
+        .wasInterrupted();
+  }
+
+private:
+  /// Groups fusible single-qubit ops into chains along one qubit wire and
+  /// hands every chain of at least two ops to `maybeFuseRun`, which decides
+  /// whether to replace it. The result of `maybeFuseRun` is deliberately
+  /// ignored.
+  void fuseOneQubitRuns(IRRewriter& rewriter, const NativeProfileSpec& spec,
+                        const decomposition::EulerBasis basis) {
+    llvm::SmallVector<OneQubitRun> runs;
+    // Maps the current last op of each open run to that run's index in `runs`.
+    llvm::DenseMap<Operation*, size_t> tailOpToRun;
+
+    getOperation()->walk([&](Operation* op) {
+      auto unitary = fusibleSingleQubitOp(op);
+      if (!unitary) {
+        return;
+      }
+      Value inQubit = unitary.getInputQubit(0);
+      Operation* producer = inQubit.getDefiningOp();
+      auto tail = tailOpToRun.end();
+      if (producer != nullptr) {
+        tail = tailOpToRun.find(producer);
+      }
+      // A run is only continued when this op consumes the run tail's output
+      // AND that output has no other user. Without the single-use check a
+      // fan-out wire could be fused and the sibling user silently dropped.
+      if (tail == tailOpToRun.end() || !inQubit.hasOneUse()) {
+        const size_t newIdx = runs.size();
+        runs.push_back(OneQubitRun{});
+        runs[newIdx].ops.push_back(unitary);
+        tailOpToRun[op] = newIdx;
+        return;
+      }
+      // Continue the run: `op` replaces the previous tail.
+      const size_t runIdx = tail->second;
+      tailOpToRun.erase(tail);
+      runs[runIdx].ops.push_back(unitary);
+      tailOpToRun[op] = runIdx;
+    });
+
+    for (auto& run : runs) {
+      // A single op has nothing to be fused with.
+      if (run.ops.size() >= 2) {
+        (void)maybeFuseRun(rewriter, run, basis, spec);
+      }
+    }
+  }
+
+  /// Forwards to `fuseTwoQubitUnitaryRuns` on the op this pass runs on and
+  /// returns its result unchanged. Per the helper's name, this is where
+  /// two-qubit windows get consolidated.
+  LogicalResult consolidateTwoQubitBlocks(IRRewriter& rewriter,
+                                          const NativeProfileSpec& spec) {
+    auto root = getOperation();
+    return fuseTwoQubitUnitaryRuns(rewriter, root, spec);
+  }
+
+  /// One synthesis sweep over the whole function: rewrite every remaining
+  /// off-menu unitary by dispatching to `rewriteSingleQubit` /
+  /// `rewriteControlled` / `rewriteTwoQubit`. Returns `failure()` as soon as
+  /// any op cannot be lowered to the native menu. Safe to call repeatedly;
+  /// `runOnOperation` iterates until convergence.
+  ///
+  /// The op list is snapshotted before any rewrite. Replacing an op also
+  /// destroys every op nested in its regions (e.g. the body of a `ctrl` /
+  /// `inv` shell), so the whole subtree is recorded as erased *before* the
+  /// rewrite runs. Otherwise later snapshot entries pointing into that body
+  /// would be dereferenced after being freed.
+  LogicalResult synthesizeRemainingOps(IRRewriter& rewriter,
+                                       const NativeProfileSpec& spec,
+                                       const decomposition::EulerBasis basis) {
+    llvm::SmallVector<Operation*, 32> ops;
+    collectUnitaryOpsInPreOrder(getOperation(), ops);
+    llvm::DenseSet<Operation*> erasedOps;
+
+    // Records `root` and all ops nested below it. Must be called while `root`
+    // is still alive, i.e. before the rewrite that replaces it.
+    const auto markSubtreeErased = [&erasedOps](Operation* root) {
+      root->walk([&erasedOps](Operation* nested) { erasedOps.insert(nested); });
+    };
+
+    for (Operation* op : ops) {
+      // Pointers were collected before this loop; never dereference an op
+      // that an earlier rewrite in this sweep erased (directly or as part of
+      // a replaced shell's body).
+      if (erasedOps.contains(op)) {
+        continue;
+      }
+      // Nested regions under `ctrl` / `inv` are handled on the shell op
+      // (e.g. `ctrl { inv { ... } }`, `inv { ... }`).
+      if (isHiddenInsideCtrlOrInvBody(op)) {
+        continue;
+      }
+      if (llvm::isa<BarrierOp, GPhaseOp>(op)) {
+        continue;
+      }
+      auto unitary = llvm::dyn_cast<UnitaryOpInterface>(op);
+      if (!unitary) {
+        continue;
+      }
+
+      if (unitary.isSingleQubit()) {
+        if (!allowsSingleQubitOp(unitary, spec)) {
+          markSubtreeErased(op);
+          if (failed(rewriteSingleQubit(rewriter, op, unitary, spec, basis))) {
+            return failure();
+          }
+        }
+        continue;
+      }
+
+      if (auto ctrl = llvm::dyn_cast<CtrlOp>(op)) {
+        // An already-native `ctrl` is left in place by `rewriteControlled`,
+        // so its body must stay visible to the rest of the sweep.
+        if (!ctrlMatchesNativeMenu(ctrl, spec)) {
+          markSubtreeErased(op);
+        }
+        if (failed(rewriteControlled(rewriter, ctrl, spec))) {
+          return failure();
+        }
+        continue;
+      }
+
+      if (unitary.isTwoQubit()) {
+        markSubtreeErased(op);
+        if (failed(rewriteTwoQubit(rewriter, op, unitary, spec))) {
+          return failure();
+        }
+        continue;
+      }
+    }
+    return success();
+  }
+
+  /// Replaces one single-qubit `op` with gates from the native profile.
+  ///
+  /// If the op exposes a constant 2x2 matrix, that matrix is re-synthesized
+  /// with `emitSingleQubitMatrix` in `basis`. Otherwise the emitters in
+  /// `spec.singleQubitEmitters` are tried in order and the first one whose
+  /// direct lowering yields a value wins. When nothing applies, an error is
+  /// emitted on `op` and `failure()` is returned.
+  static LogicalResult
+  rewriteSingleQubit(IRRewriter& rewriter, Operation* op,
+                     UnitaryOpInterface unitary, const NativeProfileSpec& spec,
+                     const decomposition::EulerBasis basis) {
+    rewriter.setInsertionPoint(op);
+    const Value in = unitary.getInputQubit(0);
+
+    // Matrix-driven path.
+    if (Matrix2x2 constantMatrix;
+        unitary.isSingleQubit() &&
+        unitary.getUnitaryMatrix2x2(constantMatrix)) {
+      const Value synthesized = emitSingleQubitMatrix(
+          rewriter, op->getLoc(), in, constantMatrix, basis);
+      rewriter.replaceOp(op, synthesized);
+      return success();
+    }
+
+    // Direct-lowering fallback, first matching emitter wins.
+    for (const auto& emitter : spec.singleQubitEmitters) {
+      if (emitterHasDirectLowering(op, emitter)) {
+        const Value lowered =
+            applyDirectSingleQubitLowering(rewriter, op, in, emitter);
+        if (lowered) {
+          rewriter.replaceOp(op, lowered);
+          return success();
+        }
+      }
+    }
+
+    op->emitError("single-qubit operation not in selected native profile");
+    return failure();
+  }
+
+  /// Lowers a `CtrlOp` to the native profile.
+  ///
+  /// Only 1-control / 1-target gates are supported; anything else is an
+  /// error. A gate that already matches the menu (`ctrlMatchesNativeMenu`) is
+  /// left untouched. Otherwise a 4x4 matrix is obtained, via
+  /// `getBlockTwoQubitMatrix` for `X` / `Z` bodies and via the op's own
+  /// `getUnitaryMatrix4x4` for any other body, and `emitTwoQubitNative`
+  /// produces the replacement.
+  static LogicalResult rewriteControlled(IRRewriter& rewriter, CtrlOp ctrl,
+                                         const NativeProfileSpec& spec) {
+    const bool supportedShape =
+        ctrl.getNumControls() == 1 && ctrl.getNumTargets() == 1;
+    if (!supportedShape) {
+      ctrl.emitError("native synthesis currently only supports 1-control "
+                     "1-target controlled gates");
+      return failure();
+    }
+    // Fast path: nothing to do for an on-menu `cx` / `cz`.
+    if (ctrlMatchesNativeMenu(ctrl, spec)) {
+      return success();
+    }
+
+    Operation* inner = ctrl.getBodyUnitary(0).getOperation();
+    Matrix4x4 matrix;
+    if (llvm::isa<XOp, ZOp>(inner)) {
+      // `cx` / `cz` that is not the profile's entangler.
+      if (!getBlockTwoQubitMatrix(ctrl.getOperation(), matrix)) {
+        ctrl.emitError("failed to compute 4x4 matrix for CtrlOp");
+        return failure();
+      }
+    } else {
+      auto asUnitary = llvm::cast<UnitaryOpInterface>(ctrl.getOperation());
+      const bool gotMatrix =
+          asUnitary.isTwoQubit() && asUnitary.getUnitaryMatrix4x4(matrix);
+      if (!gotMatrix) {
+        ctrl.emitError(
+            "native synthesis: cannot build a constant 4x4 matrix for this "
+            "controlled gate (unsupported body or non-constant parameters)");
+        return failure();
+      }
+    }
+
+    rewriter.setInsertionPoint(ctrl);
+    Value newControl;
+    Value newTarget;
+    const LogicalResult emitted = emitTwoQubitNative(
+        rewriter, ctrl.getLoc(), ctrl.getInputControl(0),
+        ctrl.getInputTarget(0), matrix, spec, newControl, newTarget);
+    if (failed(emitted)) {
+      ctrl.emitError("controlled gate not allowed by selected profile");
+      return failure();
+    }
+    rewriter.replaceOp(ctrl, ValueRange{newControl, newTarget});
+    return success();
+  }
+
+  /// Lowers a bare (non-`ctrl`) two-qubit `op` to the native profile.
+  ///
+  /// With `spec.allowRzz` set, an `RZZ` is kept as-is and `XXPlusYY` /
+  /// `XXMinusYY` are first offered to `rewriteXXPlusMinusYYViaRzz`. Everything
+  /// else, including an `XX±YY` op that the dedicated rewrite declined, is
+  /// converted to a 4x4 matrix with `getBlockTwoQubitMatrix` and synthesized
+  /// by `emitTwoQubitNative`. Either step failing emits an error on `op`.
+  static LogicalResult rewriteTwoQubit(IRRewriter& rewriter, Operation* op,
+                                       UnitaryOpInterface unitary,
+                                       const NativeProfileSpec& spec) {
+    const auto reject = [op]() -> LogicalResult {
+      op->emitError("unsupported two-qubit operation for selected profile");
+      return failure();
+    };
+
+    if (spec.allowRzz) {
+      if (llvm::isa<RZZOp>(op)) {
+        return success();
+      }
+      if (llvm::isa<XXPlusYYOp, XXMinusYYOp>(op)) {
+        rewriter.setInsertionPoint(op);
+        if (succeeded(rewriteXXPlusMinusYYViaRzz(rewriter, op))) {
+          return success();
+        }
+        // Dedicated rewrite not applicable: continue with the generic path.
+      }
+    }
+
+    Matrix4x4 matrix;
+    if (!getBlockTwoQubitMatrix(op, matrix)) {
+      return reject();
+    }
+    rewriter.setInsertionPoint(op);
+    Value newQubit0;
+    Value newQubit1;
+    const LogicalResult emitted = emitTwoQubitNative(
+        rewriter, op->getLoc(), unitary.getInputQubit(0),
+        unitary.getInputQubit(1), matrix, spec, newQubit0, newQubit1);
+    if (failed(emitted)) {
+      return reject();
+    }
+    rewriter.replaceOp(op, ValueRange{newQubit0, newQubit1});
+    return success();
+  }
+};
+
+} // namespace
+
+/// Factory: builds a `NativeGateSynthesisPass` configured with `options` and
+/// returns it as a generic `Pass` owner.
+std::unique_ptr<Pass>
+createNativeGateSynthesisPass(const NativeGateSynthesisOptions& options) {
+  auto pass = std::make_unique<NativeGateSynthesisPass>(options);
+  return pass;
+}
+
+} // namespace mlir::qco
diff --git a/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/Policy.cpp b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/Policy.cpp
new file mode 100644
index 0000000000..cc58c207e5
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/Policy.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Policy.h"
+
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
+#include "mlir/Dialect/QCO/IR/QCOOps.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Types.h"
+
+#include <llvm/ADT/STLExtras.h>
+#include <llvm/Support/Casting.h>
+#include <llvm/Support/ErrorHandling.h>
+#include <mlir/IR/Operation.h>
+
+#include <optional>
+
+namespace mlir::qco::native_synth {
+
+/// Whether `spec.entanglerBases` lists `EntanglerBasis::Cx`.
+bool usesCxEntangler(const NativeProfileSpec& spec) {
+  const auto& bases = spec.entanglerBases;
+  const bool listsCx = llvm::is_contained(bases, EntanglerBasis::Cx);
+  return listsCx;
+}
+
+/// Whether `spec.entanglerBases` lists `EntanglerBasis::Cz`.
+bool usesCzEntangler(const NativeProfileSpec& spec) {
+  const auto& bases = spec.entanglerBases;
+  const bool listsCz = llvm::is_contained(bases, EntanglerBasis::Cz);
+  return listsCz;
+}
+
+/// Translates a single-qubit op into the `NativeGateKind` whose presence in
+/// the menu makes the op native. Ops with no menu counterpart yield
+/// `std::nullopt`.
+static std::optional<NativeGateKind>
+singleQubitNativeGateKind(UnitaryOpInterface op) {
+  Operation* const underlying = op.getOperation();
+  // Axis rotations; `p` is a Z-rotation primitive for menu purposes.
+  if (llvm::isa<RZOp, POp>(underlying)) {
+    return NativeGateKind::Rz;
+  }
+  if (llvm::isa<RXOp>(underlying)) {
+    return NativeGateKind::Rx;
+  }
+  if (llvm::isa<RYOp>(underlying)) {
+    return NativeGateKind::Ry;
+  }
+  if (llvm::isa<ROp>(underlying)) {
+    return NativeGateKind::R;
+  }
+  // Generic and fixed gates.
+  if (llvm::isa<UOp>(underlying)) {
+    return NativeGateKind::U;
+  }
+  if (llvm::isa<XOp>(underlying)) {
+    return NativeGateKind::X;
+  }
+  if (llvm::isa<SXOp>(underlying)) {
+    return NativeGateKind::Sx;
+  }
+  return std::nullopt;
+}
+
+/// Whether `op` may remain in the IR under `spec`. Barriers and global
+/// phases are always accepted; every other op must map to a
+/// `NativeGateKind` that `spec.allowedGates` contains.
+bool allowsSingleQubitOp(UnitaryOpInterface op, const NativeProfileSpec& spec) {
+  Operation* const underlying = op.getOperation();
+  if (llvm::isa<BarrierOp>(underlying) || llvm::isa<GPhaseOp>(underlying)) {
+    return true;
+  }
+  if (const auto kind = singleQubitNativeGateKind(op)) {
+    return spec.allowedGates.contains(*kind);
+  }
+  // No menu counterpart at all.
+  return false;
+}
+
+/// Whether `op` has a direct (non-matrix) lowering for the ZSXX emitter:
+/// `Id` and `P` always do; `Rx`, `Ry` and `R` only when the profile offers a
+/// direct `rx` (`supportsDirectRx`).
+bool canDirectlyDecomposeToZSXX(Operation* op, bool supportsDirectRx) {
+  const bool alwaysDirect = llvm::isa<IdOp, POp>(op);
+  if (alwaysDirect) {
+    return true;
+  }
+  if (!supportsDirectRx) {
+    return false;
+  }
+  return llvm::isa<RXOp, RYOp, ROp>(op);
+}
+
+/// Whether `op` is one of the op kinds with a direct lowering for the U3
+/// emitter: `Id`, `Rx`, `Ry`, `Rz`, `P`, `U2`, `R` or `U`.
+bool canDirectlyDecomposeToU3(Operation* op) {
+  const bool isRotationOrPhase = llvm::isa<RXOp, RYOp, RZOp, POp, ROp>(op);
+  const bool isIdOrGeneric = llvm::isa<IdOp, U2Op, UOp>(op);
+  return isRotationOrPhase || isIdOrGeneric;
+}
+
+/// Whether `op` is one of the op kinds with a direct lowering for the R
+/// emitter: `Id`, `R`, `Rx` or `Ry`.
+bool canDirectlyDecomposeToR(Operation* op) {
+  if (llvm::isa<IdOp>(op)) {
+    return true;
+  }
+  return llvm::isa<ROp, RXOp, RYOp>(op);
+}
+
+/// Whether `op` has a direct lowering for the given rotation-axis pair.
+/// `Id` always does. Otherwise the op must be a rotation about one of the
+/// pair's axes; `P` additionally counts for the two pairs that contain `rz`.
+bool canDirectlyDecomposeToAxisPair(Operation* op, AxisPair axisPair) {
+  if (llvm::isa<IdOp>(op)) {
+    return true;
+  }
+  // `p` is lowered through `rz`, so it is grouped with `Rz` here.
+  const bool zLike = llvm::isa<RZOp, POp>(op);
+  const bool isRx = llvm::isa<RXOp>(op);
+  const bool isRy = llvm::isa<RYOp>(op);
+  switch (axisPair) {
+  case AxisPair::RxRz:
+    return isRx || zLike;
+  case AxisPair::RxRy:
+    // No `rz` available, hence no direct lowering of `p`.
+    return isRx || isRy;
+  case AxisPair::RyRz:
+    return isRy || zLike;
+  }
+  llvm_unreachable("unknown axis pair");
+}
+
+} // namespace mlir::qco::native_synth
diff --git a/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/SingleQubit.cpp b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/SingleQubit.cpp
new file mode 100644
index 0000000000..32e26347d4
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/SingleQubit.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/SingleQubit.h"
+
+#include "mlir/Dialect/QCO/IR/QCOOps.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Euler.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Types.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Utils.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/Support/Casting.h>
+#include <llvm/Support/ErrorHandling.h>
+#include <mlir/Dialect/Arith/IR/Arith.h>
+#include <mlir/IR/Location.h>
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/PatternMatch.h>
+#include <mlir/IR/Value.h>
+
+#include <numbers>
+
+namespace mlir::qco::native_synth {
+
+constexpr double PI = std::numbers::pi;
+constexpr double HALF_PI = PI / 2.0;
+
+namespace {
+
+/// Bundles the rewriter and location so gate-emission code does not have to
+/// pass them around. Every gate method creates one QCO op acting on `q` and
+/// returns that op's output qubit. The `double` overloads first materialize
+/// their angles as `f64` constants and then defer to the `Value` overloads.
+struct SingleQubitEmitter {
+  IRRewriter* rewriter;
+  Location loc;
+
+  /// `f64` constant of value `v`, created at `loc`.
+  [[nodiscard]] Value constF(double v) const {
+    return createF64Const(*rewriter, loc, v);
+  }
+
+  /// `rx(theta)`, angle given as an `f64` SSA value.
+  [[nodiscard]] Value rx(Value q, Value theta) const {
+    auto gate = RXOp::create(*rewriter, loc, q, theta);
+    return gate.getOutputQubit(0);
+  }
+  /// `rx(theta)`, angle given as a compile-time scalar.
+  [[nodiscard]] Value rx(Value q, double theta) const {
+    return rx(q, constF(theta));
+  }
+
+  /// `ry(theta)`, angle given as an `f64` SSA value.
+  [[nodiscard]] Value ry(Value q, Value theta) const {
+    auto gate = RYOp::create(*rewriter, loc, q, theta);
+    return gate.getOutputQubit(0);
+  }
+  /// `ry(theta)`, angle given as a compile-time scalar.
+  [[nodiscard]] Value ry(Value q, double theta) const {
+    return ry(q, constF(theta));
+  }
+
+  /// `rz(theta)`, angle given as an `f64` SSA value.
+  [[nodiscard]] Value rz(Value q, Value theta) const {
+    auto gate = RZOp::create(*rewriter, loc, q, theta);
+    return gate.getOutputQubit(0);
+  }
+  /// `rz(theta)`, angle given as a compile-time scalar.
+  [[nodiscard]] Value rz(Value q, double theta) const {
+    return rz(q, constF(theta));
+  }
+
+  /// `sx` (square-root-of-X).
+  [[nodiscard]] Value sx(Value q) const {
+    auto gate = SXOp::create(*rewriter, loc, q);
+    return gate.getOutputQubit(0);
+  }
+  /// Pauli `x`.
+  [[nodiscard]] Value x(Value q) const {
+    auto gate = XOp::create(*rewriter, loc, q);
+    return gate.getOutputQubit(0);
+  }
+
+  /// `r(theta, phi)`, angles given as `f64` SSA values.
+  [[nodiscard]] Value r(Value q, Value theta, Value phi) const {
+    auto gate = ROp::create(*rewriter, loc, q, theta, phi);
+    return gate.getOutputQubit(0);
+  }
+  /// `r(theta, phi)`, angles given as compile-time scalars.
+  [[nodiscard]] Value r(Value q, double theta, double phi) const {
+    return r(q, constF(theta), constF(phi));
+  }
+
+  /// `u(theta, phi, lambda)`, angles given as `f64` SSA values.
+  [[nodiscard]] Value u(Value q, Value theta, Value phi, Value lambda) const {
+    auto gate = UOp::create(*rewriter, loc, q, theta, phi, lambda);
+    return gate.getOutputQubit(0);
+  }
+  /// `u(theta, phi, lambda)`, angles given as compile-time scalars.
+  [[nodiscard]] Value u(Value q, double theta, double phi,
+                        double lambda) const {
+    return u(q, constF(theta), constF(phi), constF(lambda));
+  }
+};
+
+} // namespace
+
+/// Direct lowering of `op` for the ZSXX emitter, threaded through `inQubit`.
+///
+/// - `Id` emits nothing and yields `inQubit`.
+/// - `P(theta)` becomes `rz(theta)` plus a `gphase(theta / 2)`.
+/// - With `supportsDirectRx`: `Rx` yields the op's own output (nothing is
+///   emitted), `Ry(theta)` becomes `rz(-pi/2); rx(theta); rz(pi/2)`, and
+///   `R(theta, phi)` becomes `rz(-phi); rx(theta); rz(phi)`.
+///
+/// Any other op yields a null `Value`.
+Value decomposeToZSXX(IRRewriter& rewriter, Operation* op, Value inQubit,
+                      bool supportsDirectRx) {
+  if (llvm::isa<IdOp>(op)) {
+    return inQubit;
+  }
+  const Location loc = op->getLoc();
+  SingleQubitEmitter e{.rewriter = &rewriter, .loc = loc};
+  if (auto phaseGate = llvm::dyn_cast<POp>(op)) {
+    const Value rotated = e.rz(inQubit, phaseGate.getTheta());
+    auto halfAngle = arith::MulFOp::create(rewriter, loc, phaseGate.getTheta(),
+                                           e.constF(0.5));
+    GPhaseOp::create(rewriter, loc, halfAngle.getResult());
+    return rotated;
+  }
+  // Everything below needs a native `rx`.
+  if (!supportsDirectRx) {
+    return {};
+  }
+  if (auto rxGate = llvm::dyn_cast<RXOp>(op)) {
+    return rxGate.getOutputQubit(0);
+  }
+  if (auto ryGate = llvm::dyn_cast<RYOp>(op)) {
+    Value wire = e.rz(inQubit, -HALF_PI);
+    wire = e.rx(wire, ryGate.getTheta());
+    return e.rz(wire, HALF_PI);
+  }
+  if (auto rGate = llvm::dyn_cast<ROp>(op)) {
+    const Value minusPhi =
+        arith::NegFOp::create(rewriter, loc, rGate.getPhi()).getResult();
+    Value wire = e.rz(inQubit, minusPhi);
+    wire = e.rx(wire, rGate.getTheta());
+    return e.rz(wire, rGate.getPhi());
+  }
+  return {};
+}
+
+/// Direct lowering of `op` to a single `u(theta, phi, lambda)` gate threaded
+/// through `inQubit`. Returns the resulting qubit value, or a null `Value`
+/// when `op` is none of the kinds handled below.
+Value decomposeToU3(IRRewriter& rewriter, Operation* op, Value inQubit) {
+  // `Id` needs no gate at all: hand the input wire straight back.
+  if (llvm::isa<IdOp>(op)) {
+    return inQubit;
+  }
+  SingleQubitEmitter e{.rewriter = &rewriter, .loc = op->getLoc()};
+  // A `u` is already in the target form; nothing is emitted and the op's own
+  // output is returned.
+  if (auto u = llvm::dyn_cast<UOp>(op)) {
+    return u.getOutputQubit(0);
+  }
+  // rx(theta) -> u(theta, -pi/2, pi/2)
+  if (auto rx = llvm::dyn_cast<RXOp>(op)) {
+    return e.u(inQubit, rx.getTheta(), e.constF(-HALF_PI), e.constF(HALF_PI));
+  }
+  // ry(theta) -> u(theta, 0, 0)
+  if (auto ry = llvm::dyn_cast<RYOp>(op)) {
+    return e.u(inQubit, ry.getTheta(), e.constF(0.0), e.constF(0.0));
+  }
+  // rz(theta) -> u(0, 0, theta) plus a compensating gphase(-theta / 2).
+  if (auto rz = llvm::dyn_cast<RZOp>(op)) {
+    auto out = e.u(inQubit, e.constF(0.0), e.constF(0.0), rz.getTheta());
+    auto halfTheta = arith::MulFOp::create(rewriter, op->getLoc(),
+                                           rz.getTheta(), e.constF(-0.5))
+                         .getResult();
+    GPhaseOp::create(rewriter, op->getLoc(), halfTheta);
+    return out;
+  }
+  // p(theta) -> u(0, 0, theta); no phase op is emitted for this case.
+  if (auto p = llvm::dyn_cast<POp>(op)) {
+    return e.u(inQubit, e.constF(0.0), e.constF(0.0), p.getTheta());
+  }
+  // u2(phi, lambda) -> u(pi/2, phi, lambda)
+  if (auto u2 = llvm::dyn_cast<U2Op>(op)) {
+    return e.u(inQubit, e.constF(HALF_PI), u2.getPhi(), u2.getLambda());
+  }
+  // r(theta, phi) -> u(theta, phi - pi/2, -phi + pi/2); the two shifted
+  // angles are computed with `arith` ops so `phi` may be a runtime value.
+  if (auto r = llvm::dyn_cast<ROp>(op)) {
+    auto loc = op->getLoc();
+    auto phiMinus =
+        arith::AddFOp::create(rewriter, loc, r.getPhi(), e.constF(-HALF_PI))
+            .getResult();
+    auto negPhi = arith::NegFOp::create(rewriter, loc, r.getPhi()).getResult();
+    auto minusPlus =
+        arith::AddFOp::create(rewriter, loc, negPhi, e.constF(HALF_PI))
+            .getResult();
+    return e.u(inQubit, r.getTheta(), phiMinus, minusPlus);
+  }
+  // No direct `u` lowering for this op kind.
+  return {};
+}
+
+/// Synthesizes `matrix` on `inQubit` as gates of `basis` and returns the
+/// resulting qubit value.
+Value emitSingleQubitMatrix(IRRewriter& rewriter, Location loc, Value inQubit,
+                            const Matrix2x2& matrix,
+                            const decomposition::EulerBasis basis) {
+  // `runSize = 0` together with `hasNonBasisGate = true` forces emission, so
+  // the matrix is always lowered into native gates of `basis` (including any
+  // residual `qco.gphase`). With these arguments `synthesizeUnitary1QEuler`
+  // never returns `std::nullopt`, which is why the result is dereferenced
+  // without a check.
+  const auto synthesized = decomposition::synthesizeUnitary1QEuler(
+      rewriter, loc, inQubit, matrix, /*runSize=*/0,
+      /*hasNonBasisGate=*/true, basis);
+  return *synthesized;
+}
+
+/// Direct lowering of `op` to a single `r(theta, phi)` gate threaded through
+/// `inQubit`: `Id` yields `inQubit`, an existing `R` yields its own output
+/// (nothing is emitted), `Rx(theta)` becomes `r(theta, 0)` and `Ry(theta)`
+/// becomes `r(theta, pi/2)`. Any other op yields a null `Value`.
+Value decomposeToR(IRRewriter& rewriter, Operation* op, Value inQubit) {
+  if (llvm::isa<IdOp>(op)) {
+    return inQubit;
+  }
+  if (auto rGate = llvm::dyn_cast<ROp>(op)) {
+    return rGate.getOutputQubit(0);
+  }
+  SingleQubitEmitter e{.rewriter = &rewriter, .loc = op->getLoc()};
+  if (auto rxGate = llvm::dyn_cast<RXOp>(op)) {
+    const Value zeroPhi = e.constF(0.0);
+    return e.r(inQubit, rxGate.getTheta(), zeroPhi);
+  }
+  if (auto ryGate = llvm::dyn_cast<RYOp>(op)) {
+    const Value quarterTurnPhi = e.constF(HALF_PI);
+    return e.r(inQubit, ryGate.getTheta(), quarterTurnPhi);
+  }
+  return {};
+}
+
+/// Direct lowering of `op` for a two-axis rotation emitter, threaded through
+/// `inQubit`. `Id` yields `inQubit`. A rotation about one of the pair's own
+/// axes yields the op's own output (nothing is emitted). For the pairs that
+/// contain `rz`, `P(theta)` becomes `rz(theta)` plus `gphase(theta / 2)`.
+/// Every other combination yields a null `Value`.
+Value decomposeToAxisPair(IRRewriter& rewriter, Operation* op, Value inQubit,
+                          AxisPair axisPair) {
+  if (llvm::isa<IdOp>(op)) {
+    return inQubit;
+  }
+  SingleQubitEmitter e{.rewriter = &rewriter, .loc = op->getLoc()};
+  // Shared by `RxRz` and `RyRz`: rz(theta) followed by gphase(theta / 2).
+  const auto lowerPhaseGate = [&](POp phaseGate) -> Value {
+    const Value rotated = e.rz(inQubit, phaseGate.getTheta());
+    auto halfAngle = arith::MulFOp::create(
+        rewriter, op->getLoc(), phaseGate.getTheta(), e.constF(0.5));
+    GPhaseOp::create(rewriter, op->getLoc(), halfAngle.getResult());
+    return rotated;
+  };
+  // At most one of these casts succeeds.
+  auto rxGate = llvm::dyn_cast<RXOp>(op);
+  auto ryGate = llvm::dyn_cast<RYOp>(op);
+  auto rzGate = llvm::dyn_cast<RZOp>(op);
+  auto phaseGate = llvm::dyn_cast<POp>(op);
+  switch (axisPair) {
+  case AxisPair::RxRz:
+    if (rxGate) {
+      return rxGate.getOutputQubit(0);
+    }
+    if (rzGate) {
+      return rzGate.getOutputQubit(0);
+    }
+    return phaseGate ? lowerPhaseGate(phaseGate) : Value{};
+  case AxisPair::RxRy:
+    if (rxGate) {
+      return rxGate.getOutputQubit(0);
+    }
+    if (ryGate) {
+      return ryGate.getOutputQubit(0);
+    }
+    return {};
+  case AxisPair::RyRz:
+    if (ryGate) {
+      return ryGate.getOutputQubit(0);
+    }
+    if (rzGate) {
+      return rzGate.getOutputQubit(0);
+    }
+    return phaseGate ? lowerPhaseGate(phaseGate) : Value{};
+  }
+  llvm_unreachable("unknown axis pair");
+}
+
+} // namespace mlir::qco::native_synth
diff --git a/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/TwoQubit.cpp b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/TwoQubit.cpp
new file mode 100644
index 0000000000..226d14303b
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/TwoQubit.cpp
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/TwoQubit.h"
+
+#include "mlir/Dialect/QCO/IR/QCOOps.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/BasisDecomposer.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/UnitaryMatrices.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/WeylDecomposition.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/NativeSpec.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Policy.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/SingleQubit.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Types.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Utils.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/Casting.h>
+#include <mlir/Dialect/Arith/IR/Arith.h>
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/PatternMatch.h>
+#include <mlir/IR/Value.h>
+#include <mlir/Support/LogicalResult.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <numbers>
+#include <optional>
+#include <utility>
+
+namespace mlir::qco::native_synth {
+
+constexpr double PI = std::numbers::pi;
+constexpr double HALF_PI = PI / 2.0;
+
+/// Picks the entangler used for synthesis. `cx` wins whenever the profile
+/// lists it, `cz` is the fallback, and a profile with neither yields
+/// `std::nullopt`.
+static std::optional<EntanglerBasis>
+selectEntangler(const NativeProfileSpec& spec) {
+  std::optional<EntanglerBasis> choice;
+  if (usesCxEntangler(spec)) {
+    choice = EntanglerBasis::Cx;
+  } else if (usesCzEntangler(spec)) {
+    choice = EntanglerBasis::Cz;
+  }
+  return choice;
+}
+
+/// 4x4 matrix of the chosen entangler: `decomposition::czGate()` for `Cz`,
+/// `decomposition::cxGate01()` for everything else. Per the original note the
+/// layout follows the MQT operand order (control on qubit 0 = MSB), matching
+/// `getBlockTwoQubitMatrix`.
+static Matrix4x4 entanglerMatrix(EntanglerBasis entangler) {
+  if (entangler == EntanglerBasis::Cz) {
+    return decomposition::czGate();
+  }
+  return decomposition::cxGate01();
+}
+
+/// Decomposes `target` over `entangler`: builds a `TwoQubitBasisDecomposer`
+/// for the entangler matrix (second argument `1.0`) and a
+/// `TwoQubitWeylDecomposition` of `target`, then returns whatever
+/// `twoQubitDecompose` produces, i.e. the raw single-qubit factors and the
+/// entangler count, or `std::nullopt`.
+static std::optional<decomposition::TwoQubitNativeDecomposition>
+decomposeWithEntangler(const Matrix4x4& target, EntanglerBasis entangler) {
+  const Matrix4x4 basisGate = entanglerMatrix(entangler);
+  auto basisDecomposer =
+      decomposition::TwoQubitBasisDecomposer::create(basisGate, 1.0);
+  auto weylOfTarget =
+      decomposition::TwoQubitWeylDecomposition::create(target, std::nullopt);
+  return basisDecomposer.twoQubitDecompose(weylOfTarget, std::nullopt);
+}
+
+/// Number of entangler uses the decomposition of `target` needs under
+/// `spec`. Yields `std::nullopt` when the profile has no entangler or the
+/// decomposition itself fails.
+std::optional<std::uint8_t>
+twoQubitEntanglerCount(const Matrix4x4& target, const NativeProfileSpec& spec) {
+  if (const auto entangler = selectEntangler(spec)) {
+    if (const auto native = decomposeWithEntangler(target, *entangler)) {
+      return native->numBasisUses;
+    }
+  }
+  return std::nullopt;
+}
+
+/// Synthesizes the two-qubit unitary `target` on (`qubit0`, `qubit1`) from the
+/// profile's entangler and single-qubit gates.
+///
+/// The entangler is chosen by `selectEntangler`; single-qubit factors are
+/// emitted in the Euler basis of the profile's first single-qubit emitter.
+/// On success the final wires are stored in `outQubit0` / `outQubit1`.
+///
+/// Returns `failure()` without creating any IR and without touching the out
+/// parameters when the profile has no entangler, has no single-qubit
+/// emitter, or the decomposition fails.
+LogicalResult emitTwoQubitNative(IRRewriter& rewriter, Location loc,
+                                 Value qubit0, Value qubit1,
+                                 const Matrix4x4& target,
+                                 const NativeProfileSpec& spec,
+                                 Value& outQubit0, Value& outQubit1) {
+  const auto entangler = selectEntangler(spec);
+  if (!entangler) {
+    return failure();
+  }
+  // `front()` on an empty emitter list is undefined behaviour; reject such a
+  // profile up front, before the decomposition and before any IR is created.
+  if (spec.singleQubitEmitters.empty()) {
+    return failure();
+  }
+  const auto native = decomposeWithEntangler(target, *entangler);
+  if (!native) {
+    return failure();
+  }
+  const auto basis = emitterEulerBasis(spec.singleQubitEmitters.front());
+
+  // Residual global phase not represented by the factors / entanglers.
+  emitGPhaseIfNonTrivial(rewriter, loc, native->globalPhase);
+
+  Value wire0 = qubit0;
+  Value wire1 = qubit1;
+  const auto& factors = native->singleQubitFactors;
+  const std::uint8_t numBasisUses = native->numBasisUses;
+  const auto emitFactor = [&](Value& wire, std::size_t index) {
+    wire = emitSingleQubitMatrix(rewriter, loc, wire, factors[index], basis);
+  };
+  const auto emitEntangler = [&]() {
+    // The entangler acts with its control on wire 0 and target on wire 1.
+    auto ctrlOp = CtrlOp::create(
+        rewriter, loc, ValueRange{wire0}, ValueRange{wire1},
+        [&](ValueRange targetArgs) -> llvm::SmallVector<Value> {
+          if (*entangler == EntanglerBasis::Cz) {
+            return {
+                ZOp::create(rewriter, loc, targetArgs[0]).getOutputQubit(0)};
+          }
+          return {XOp::create(rewriter, loc, targetArgs[0]).getOutputQubit(0)};
+        });
+    wire0 = ctrlOp.getOutputControl(0);
+    wire1 = ctrlOp.getOutputTarget(0);
+  };
+
+  // factor[2i] on wire 1, factor[2i + 1] on wire 0, then one entangler.
+  for (std::uint8_t i = 0; i < numBasisUses; ++i) {
+    emitFactor(wire1, static_cast<std::size_t>(2 * i));
+    emitFactor(wire0, static_cast<std::size_t>((2 * i) + 1));
+    emitEntangler();
+  }
+  // Trailing pair of factors after the last entangler.
+  emitFactor(wire1, static_cast<std::size_t>(2 * numBasisUses));
+  emitFactor(wire0, static_cast<std::size_t>((2 * numBasisUses) + 1));
+
+  outQubit0 = wire0;
+  outQubit1 = wire1;
+  return success();
+}
+
+/// Replaces an `XXPlusYY` / `XXMinusYY` op by `rz`, `sx`, `rx` and `rzz`
+/// gates (plus a `gphase`), without using a `cx` / `cz` entangler.
+///
+/// The replacement is inserted in front of `op`, which is then replaced by
+/// the two resulting qubit values. Constant angles are folded into new
+/// constants; runtime angles are computed with `arith` ops. Returns
+/// `failure()` (leaving the IR untouched) for any other op kind.
+LogicalResult rewriteXXPlusMinusYYViaRzz(IRRewriter& rewriter, Operation* op) {
+  rewriter.setInsertionPoint(op);
+  const auto loc = op->getLoc();
+  const auto constF = [&](double v) {
+    return createF64Const(rewriter, loc, v);
+  };
+  // `v / 2`, folded when `v` is a known constant.
+  const auto half = [&](Value v) -> Value {
+    if (auto c = getConstantF64(v)) {
+      return constF(*c * 0.5);
+    }
+    return arith::MulFOp::create(rewriter, loc, v, constF(0.5)).getResult();
+  };
+  // `-v`, folded when `v` is a known constant.
+  const auto neg = [&](Value v) -> Value {
+    if (auto c = getConstantF64(v)) {
+      return constF(-*c);
+    }
+    return arith::NegFOp::create(rewriter, loc, v).getResult();
+  };
+  // Hadamard up to a phase: with the standard `rz` / `sx` matrices,
+  // `rz(pi/2) * sx * rz(pi/2) == e^{-i*pi/4} * H`.
+  const auto emitH = [&](Value q) -> Value {
+    auto rz0 = RZOp::create(rewriter, loc, q, constF(HALF_PI));
+    auto sx = SXOp::create(rewriter, loc, rz0.getOutputQubit(0));
+    return RZOp::create(rewriter, loc, sx.getOutputQubit(0), constF(HALF_PI))
+        .getOutputQubit(0);
+  };
+  // Realize `Rxx(theta)` as `(H ⊗ H) * Rzz(theta) * (H ⊗ H)`: Hadamard
+  // conjugation maps the Z axis to X on each qubit, and the tensor-product
+  // identity `(H ⊗ H) * ZZ * (H ⊗ H) == XX` lifts that to the entangler.
+  // Because `emitH` carries a phase of `e^{-i*pi/4}` and is applied four
+  // times, the gate sequence equals `-Rxx(theta)`; the trailing `gphase(pi)`
+  // restores the exact unitary.
+  const auto emitRxxViaRzz = [&](Value q0, Value q1,
+                                 Value theta) -> std::pair<Value, Value> {
+    q0 = emitH(q0);
+    q1 = emitH(q1);
+    auto rzz = RZZOp::create(rewriter, loc, q0, q1, theta);
+    q0 = emitH(rzz.getOutputQubit(0));
+    q1 = emitH(rzz.getOutputQubit(1));
+    GPhaseOp::create(rewriter, loc, constF(PI));
+    return {q0, q1};
+  };
+  // Realize `Ryy(theta)` as `(Rx(-pi/2) ⊗ Rx(-pi/2)) * Rzz(theta) *
+  // (Rx(pi/2) ⊗ Rx(pi/2))`: Rx(pi/2) maps Z to Y on each qubit, so the
+  // conjugation transports `ZZ` to `YY`. The two `rx` layers are exact
+  // inverses of each other, so no phase correction is needed here.
+  const auto emitRyyViaRzz = [&](Value q0, Value q1,
+                                 Value theta) -> std::pair<Value, Value> {
+    auto rx0 = RXOp::create(rewriter, loc, q0, constF(HALF_PI));
+    auto rx1 = RXOp::create(rewriter, loc, q1, constF(HALF_PI));
+    auto rzz = RZZOp::create(rewriter, loc, rx0.getOutputQubit(0),
+                             rx1.getOutputQubit(0), theta);
+    auto rxb0 =
+        RXOp::create(rewriter, loc, rzz.getOutputQubit(0), constF(-HALF_PI));
+    auto rxb1 =
+        RXOp::create(rewriter, loc, rzz.getOutputQubit(1), constF(-HALF_PI));
+    return {rxb0.getOutputQubit(0), rxb1.getOutputQubit(0)};
+  };
+
+  // Both ops are emitted as
+  //   rz(-beta) on q0 -> entangling core -> rz(+beta) on q0,
+  // and differ only in the core:
+  //   XXPlusYY:  exp(-i * theta/4 * (XX + YY))  == Ryy(theta/2) * Rxx(theta/2)
+  //   XXMinusYY: exp(-i * theta/4 * (XX - YY))  == Rxx(theta/2) * Ryy(-theta/2)
+  // XX and YY commute, so the order of the two factors does not affect the
+  // unitary; the sign of the `Ryy` angle is what distinguishes the variants.
+  if (auto xxPlus = llvm::dyn_cast<XXPlusYYOp>(op)) {
+    Value q0 = xxPlus.getInputQubit(0);
+    Value q1 = xxPlus.getInputQubit(1);
+    q0 = RZOp::create(rewriter, loc, q0, neg(xxPlus.getBeta()))
+             .getOutputQubit(0);
+    const Value halfTheta = half(xxPlus.getTheta());
+    const auto afterYY = emitRyyViaRzz(q0, q1, halfTheta);
+    const auto afterXX =
+        emitRxxViaRzz(afterYY.first, afterYY.second, halfTheta);
+    q0 = RZOp::create(rewriter, loc, afterXX.first, xxPlus.getBeta())
+             .getOutputQubit(0);
+    q1 = afterXX.second;
+    rewriter.replaceOp(op, ValueRange{q0, q1});
+    return success();
+  }
+  if (auto xxMinus = llvm::dyn_cast<XXMinusYYOp>(op)) {
+    Value q0 = xxMinus.getInputQubit(0);
+    Value q1 = xxMinus.getInputQubit(1);
+    q0 = RZOp::create(rewriter, loc, q0, neg(xxMinus.getBeta()))
+             .getOutputQubit(0);
+    const Value halfTheta = half(xxMinus.getTheta());
+    const auto afterXX = emitRxxViaRzz(q0, q1, halfTheta);
+    const auto afterYY =
+        emitRyyViaRzz(afterXX.first, afterXX.second, neg(halfTheta));
+    q0 = RZOp::create(rewriter, loc, afterYY.first, xxMinus.getBeta())
+             .getOutputQubit(0);
+    q1 = afterYY.second;
+    rewriter.replaceOp(op, ValueRange{q0, q1});
+    return success();
+  }
+  return failure();
+}
+
+} // namespace mlir::qco::native_synth
diff --git a/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/Utils.cpp b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/Utils.cpp
new file mode 100644
index 0000000000..8c7a6ce523
--- /dev/null
+++ b/mlir/lib/Dialect/QCO/Transforms/NativeSynthesis/Utils.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Utils.h"
+
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
+#include "mlir/Dialect/QCO/IR/QCOOps.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <llvm/ADT/APFloat.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/Casting.h>
+#include <mlir/Dialect/Arith/IR/Arith.h>
+#include <mlir/IR/BuiltinAttributes.h>
+#include <mlir/IR/Location.h>
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/PatternMatch.h>
+#include <mlir/IR/Value.h>
+#include <mlir/Support/WalkResult.h>
+
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <optional>
+
+namespace mlir::qco::native_synth {
+
+Value createF64Const(IRRewriter& rewriter, Location loc, double value) {
+  const auto ty = rewriter.getF64Type(); // the constant is always typed f64
+  return arith::ConstantFloatOp::create(rewriter, loc, ty, llvm::APFloat(value))
+      .getResult();
+}
+
+// Read the double stored in `value` when it is defined by a float constant
+// op whose value attribute is a `FloatAttr`; `std::nullopt` in any other case.
+std::optional<double> getConstantF64(Value value) {
+  auto def = value.getDefiningOp<arith::ConstantFloatOp>();
+  auto attr = def ? llvm::dyn_cast<FloatAttr>(def.getValue()) : FloatAttr{};
+  // A null `attr` covers both "no such defining op" and "not a FloatAttr".
+  return attr ? std::optional<double>(attr.getValueAsDouble()) : std::nullopt;
+}
+
+void emitGPhaseIfNonTrivial(IRRewriter& rewriter, Location loc, double phase) {
+  const bool negligible = !(std::abs(phase) > 1e-12); // also true for NaN
+  if (!negligible) {
+    GPhaseOp::create(rewriter, loc, createF64Const(rewriter, loc, phase));
+  }
+}
+
+// Writes the 4x4 unitary of `op` into `matrix`; returns false (leaving
+// `matrix` untouched) when `op` has no supported two-qubit matrix.
+bool getBlockTwoQubitMatrix(Operation* op, Matrix4x4& matrix) {
+  if (llvm::isa<BarrierOp, GPhaseOp>(op)) {
+    return false;
+  }
+  auto ctrl = llvm::dyn_cast<CtrlOp>(op);
+  if (!ctrl) {
+    auto iface = llvm::dyn_cast<UnitaryOpInterface>(op);
+    Matrix4x4 scratch;
+    if (!iface || !iface.isTwoQubit() || !iface.getUnitaryMatrix4x4(scratch)) {
+      return false;
+    }
+    matrix = scratch;
+    return true;
+  }
+  if (ctrl.getNumControls() != 1 || ctrl.getNumTargets() != 1) {
+    return false;
+  }
+  auto* inner = ctrl.getBodyUnitary(0).getOperation();
+  if (llvm::isa<XOp>(inner)) {
+    // CX, laid out in the same 4x4 basis order as ``getUnitaryMatrix4x4``.
+    matrix = Matrix4x4::fromElements(1, 0, 0, 0, //
+                                     0, 1, 0, 0, //
+                                     0, 0, 0, 1, //
+                                     0, 0, 1, 0);
+    return true;
+  }
+  if (!llvm::isa<ZOp>(inner)) {
+    return false;
+  }
+  matrix = Matrix4x4::identity();
+  matrix(3, 3) = -1.0; // CZ: identity with the last diagonal entry negated
+  return true;
+}
+
+void collectUnitaryOpsInPreOrder(Operation* root,
+                                 llvm::SmallVectorImpl<Operation*>& ops) {
+  // `walk` is post-order by default; request pre-order to honor the name.
+  root->walk<WalkOrder::PreOrder>([&](Operation* op) {
+    // Drop ops nested in a `CtrlOp`, and non-`InvOp` ops nested in an `InvOp`.
+    if (op->getParentOfType<CtrlOp>() ||
+        (!llvm::isa<InvOp>(op) && op->getParentOfType<InvOp>())) {
+      return;
+    }
+    if (llvm::isa<UnitaryOpInterface>(op)) {
+      ops.push_back(op);
+    }
+  });
+}
+
+} // namespace mlir::qco::native_synth
diff --git a/mlir/lib/Dialect/QCO/Utils/Matrix.cpp b/mlir/lib/Dialect/QCO/Utils/Matrix.cpp
index 8df45840a3..bb5eedc63d 100644
--- a/mlir/lib/Dialect/QCO/Utils/Matrix.cpp
+++ b/mlir/lib/Dialect/QCO/Utils/Matrix.cpp
@@ -231,6 +231,10 @@ Matrix2x2 Matrix2x2::adjoint() const {
                       std::conj(data[1]), std::conj(data[3]));
 }
 
+Matrix2x2 Matrix2x2::transpose() const {
+  return fromElements(data[0], data[2], data[1], data[3]); // [1] <-> [2]
+}
+
 Complex Matrix2x2::trace() const { return data[0] + data[3]; }
 
 Complex Matrix2x2::determinant() const {
@@ -241,6 +245,10 @@ bool Matrix2x2::isApprox(const Matrix2x2& other, const double tol) const {
   return entriesAreApprox(data, other.data, tol);
 }
 
+bool Matrix2x2::isIdentity(const double tol) const {
+  return isApprox(fromElements(1.0, 0.0, 0.0, 1.0), tol); // vs. 2x2 identity
+}
+
 bool Matrix2x2::assignFrom(const DynamicMatrix& src) {
   return assignFromDynamicImpl<K_ROWS, K_SIZE_AT_COMPILE_TIME>(src, data);
 }
@@ -296,6 +304,16 @@ Matrix4x4 Matrix4x4::adjoint() const {
   return out;
 }
 
+Matrix4x4 Matrix4x4::transpose() const {
+  Matrix4x4 flipped{};
+  // Visit each flat index once: `idx / K_COLS` is the source row and
+  // `idx % K_COLS` the source column; the entry is stored at the mirrored slot.
+  for (std::size_t idx = 0; idx < K_ROWS * K_COLS; ++idx) {
+    flipped.data[((idx % K_COLS) * K_COLS) + (idx / K_COLS)] = data[idx];
+  }
+  return flipped;
+}
+
 Complex Matrix4x4::trace() const {
   return data[0] + data[5] + data[10] + data[15];
 }
@@ -321,6 +339,58 @@ bool Matrix4x4::isApprox(const Matrix4x4& other, const double tol) const {
   return entriesAreApprox(data, other.data, tol);
 }
 
+bool Matrix4x4::isIdentity(const double tol) const {
+  // The reference is the library's `identity()` matrix. `isApprox` performs
+  // the comparison with tolerance `tol`, so the result is true exactly when
+  // this matrix is approximately the identity in that sense.
+  const Matrix4x4 reference = identity();
+  return isApprox(reference, tol);
+}
+
+std::array<Complex, Matrix4x4::K_ROWS> Matrix4x4::diagonal() const {
+  return {data[0], data[5], data[10], data[15]}; // flat indices 5 * i
+}
+
+Matrix4x4
+Matrix4x4::fromDiagonal(const std::array<Complex, K_ROWS>& diagonalEntries) {
+  Matrix4x4 result{};
+  for (std::size_t i = 0, flat = 0; i < K_ROWS; ++i, flat += K_COLS + 1) {
+    result.data[flat] = diagonalEntries[i]; // flat == i * K_COLS + i
+  }
+  return result;
+}
+
+std::array<Complex, Matrix4x4::K_ROWS>
+Matrix4x4::column(const std::size_t col) const {
+  return {data[col], data[K_COLS + col], data[(2 * K_COLS) + col],
+          data[(3 * K_COLS) + col]}; // rows 0 through 3 of column `col`
+}
+
+void Matrix4x4::setColumn(const std::size_t col,
+                          const std::array<Complex, K_ROWS>& values) {
+  for (std::size_t row = 0, flat = col; row < K_ROWS; ++row, flat += K_COLS) {
+    data[flat] = values[row]; // flat == row * K_COLS + col
+  }
+}
+
+std::array<double, Matrix4x4::K_SIZE_AT_COMPILE_TIME>
+Matrix4x4::realPart() const {
+  std::array<double, K_SIZE_AT_COMPILE_TIME> reals{};
+  for (std::size_t idx = 0; idx != reals.size(); ++idx) {
+    reals[idx] = std::real(data[idx]); // same flat order as the storage
+  }
+  return reals;
+}
+
+std::array<double, Matrix4x4::K_SIZE_AT_COMPILE_TIME>
+Matrix4x4::imagPart() const {
+  std::array<double, K_SIZE_AT_COMPILE_TIME> imags{};
+  for (std::size_t idx = 0; idx != imags.size(); ++idx) {
+    imags[idx] = std::imag(data[idx]); // same flat order as the storage
+  }
+  return imags;
+}
+
 bool Matrix4x4::assignFrom(const DynamicMatrix& src) {
   return assignFromDynamicImpl<K_ROWS, K_SIZE_AT_COMPILE_TIME>(src, data);
 }
@@ -453,4 +523,103 @@ bool DynamicMatrix::isApprox(const DynamicMatrix& other,
   return entriesAreApprox(impl_->data, other.impl_->data, tol);
 }
 
+Matrix2x2 operator*(const Complex& scalar, const Matrix2x2& matrix) {
+  return matrix * scalar; // scalar-on-the-left delegates to matrix * scalar
+}
+
+Matrix4x4 operator*(const Complex& scalar, const Matrix4x4& matrix) {
+  return matrix * scalar; // scalar-on-the-left delegates to matrix * scalar
+}
+
+Matrix4x4 kron(const Matrix2x2& lhs, const Matrix2x2& rhs) {
+  // Entry (r, c) of the result is lhs(r / 2, c / 2) * rhs(r % 2, c % 2): the
+  // high half of each index selects the `lhs` factor, the low half the `rhs`
+  // factor.
+  constexpr std::size_t rows = 2 * Matrix2x2::K_ROWS;
+  constexpr std::size_t cols = 2 * Matrix2x2::K_COLS;
+  Matrix4x4 product{};
+  for (std::size_t r = 0; r < rows; ++r) {
+    for (std::size_t c = 0; c < cols; ++c) {
+      product(r, c) = lhs(r / 2, c / 2) * rhs(r % 2, c % 2);
+    }
+  }
+  return product;
+}
+
+SymmetricEigen4 jacobiSymmetricEigen(const std::array<double, 16>& symmetric) {
+  constexpr std::size_t n = 4; // entry (i, j) lives at flat index i * n + j
+  constexpr int maxSweeps = 100; // cap on passes over all (p, q) pairs
+  // Jacobi iteration on a copy of `symmetric`; `v` collects the rotations.
+  std::array<double, 16> a = symmetric;
+  std::array<double, 16> v{};
+  for (std::size_t i = 0; i < n; ++i) {
+    v[(i * n) + i] = 1.0;
+  }
+
+  for (int sweep = 0; sweep < maxSweeps; ++sweep) {
+    double off = 0.0; // sum of squares of the strictly upper-triangular part
+    for (std::size_t p = 0; p < n; ++p) {
+      for (std::size_t q = p + 1; q < n; ++q) {
+        off += a[(p * n) + q] * a[(p * n) + q];
+      }
+    }
+    if (off <= 1e-30) { // off-diagonal mass negligible: stop sweeping
+      break;
+    }
+
+    for (std::size_t p = 0; p < n; ++p) {
+      for (std::size_t q = p + 1; q < n; ++q) {
+        const double apq = a[(p * n) + q];
+        if (std::abs(apq) <= 1e-300) { // nothing to annihilate for this pair
+          continue;
+        }
+        const double app = a[(p * n) + p];
+        const double aqq = a[(q * n) + q];
+        // Rotation angle that annihilates the (p, q) off-diagonal entry.
+        const double phi = 0.5 * std::atan2(2.0 * apq, aqq - app);
+        const double c = std::cos(phi);
+        const double s = std::sin(phi);
+
+        // Right-multiply by the Givens rotation: columns p and q.
+        for (std::size_t k = 0; k < n; ++k) {
+          const double akp = a[(k * n) + p];
+          const double akq = a[(k * n) + q];
+          a[(k * n) + p] = (c * akp) - (s * akq);
+          a[(k * n) + q] = (s * akp) + (c * akq);
+        }
+        // Left-multiply by the transposed rotation: rows p and q.
+        for (std::size_t k = 0; k < n; ++k) {
+          const double apk = a[(p * n) + k];
+          const double aqk = a[(q * n) + k];
+          a[(p * n) + k] = (c * apk) - (s * aqk);
+          a[(q * n) + k] = (s * apk) + (c * aqk);
+        }
+        // Accumulate the rotation into the eigenvector matrix.
+        for (std::size_t k = 0; k < n; ++k) {
+          const double vkp = v[(k * n) + p];
+          const double vkq = v[(k * n) + q];
+          v[(k * n) + p] = (c * vkp) - (s * vkq);
+          v[(k * n) + q] = (s * vkp) + (c * vkq);
+        }
+      }
+    }
+  }
+  // NOTE(review): reaching `maxSweeps` is not reported; `a` is used as-is.
+  std::array<double, 4> evals{a[0], a[5], a[10], a[15]}; // diagonal of `a`
+  std::array<std::size_t, 4> order{0, 1, 2, 3};
+  std::ranges::sort(order, [&evals](const std::size_t x, const std::size_t y) {
+    return evals[x] < evals[y]; // ascending eigenvalue order
+  });
+  // Column j of `eigenvectors` is the column of `v` paired with eigenvalue j.
+  SymmetricEigen4 result;
+  for (std::size_t j = 0; j < n; ++j) {
+    const std::size_t src = order[j];
+    result.eigenvalues[j] = evals[src];
+    for (std::size_t i = 0; i < n; ++i) {
+      result.eigenvectors(i, j) = Complex{v[(i * n) + src], 0.0};
+    }
+  }
+  return result;
+}
+
 } // namespace mlir::qco
diff --git a/mlir/tools/mqt-cc/CMakeLists.txt b/mlir/tools/mqt-cc/CMakeLists.txt
index adf0cd32d3..25adecd290 100644
--- a/mlir/tools/mqt-cc/CMakeLists.txt
+++ b/mlir/tools/mqt-cc/CMakeLists.txt
@@ -18,7 +18,8 @@ target_link_libraries(
           # Required for OpenQASM parsing
           MQT::CoreQASM
           MQT::CoreIR
-          MLIRQCTranslation)
+          MLIRQCTranslation
+          MLIRMemRefDialect)
 
 mqt_mlir_target_use_project_options(mqt-cc)
 llvm_update_compile_flags(mqt-cc)
diff --git a/mlir/tools/mqt-cc/mqt-cc.cpp b/mlir/tools/mqt-cc/mqt-cc.cpp
index cc13e2cdba..43625aa73a 100644
--- a/mlir/tools/mqt-cc/mqt-cc.cpp
+++ b/mlir/tools/mqt-cc/mqt-cc.cpp
@@ -91,6 +91,13 @@ static llvm::cl::opt<bool> enableHadamardLifting(
     llvm::cl::desc("Apply Hadamard lifting during optimization"),
     llvm::cl::init(false));
 
+static llvm::cl::opt<std::string> nativeGates(
+    "native-gates", // copied into `config.nativeGates` in main()
+    llvm::cl::desc(
+        "Comma-separated native gate menu for the native-gate-synthesis "
+        "pass (empty or whitespace-only disables synthesis)"),
+    llvm::cl::value_desc("csv"), llvm::cl::init("")); // default: empty menu
+
 /**
  * @brief Load and parse a .qasm file
  */
@@ -187,6 +194,7 @@ int main(int argc, char** argv) {
   config.disableMergeSingleQubitRotationGates =
       disableMergeSingleQubitRotationGates;
   config.enableHadamardLifting = enableHadamardLifting;
+  config.nativeGates = nativeGates.getValue();
 
   // Run the compilation pipeline
   CompilationRecord record;
@@ -207,7 +215,8 @@ int main(int argc, char** argv) {
                  << record.afterQCOConversion << "\n";
     llvm::outs() << "After Initial QCO Canonicalization:\n"
                  << record.afterQCOCanon << "\n";
-    llvm::outs() << "After Optimization:\n" << record.afterOptimization << "\n";
+    llvm::outs() << "After Optimization and Native Gate Synthesis:\n"
+                 << record.afterOptimization << "\n";
     llvm::outs() << "After Final QCO Canonicalization:\n"
                  << record.afterOptimizationCanon << "\n";
     llvm::outs() << "After QCO-to-QC Conversion:\n"
diff --git a/mlir/unittests/Compiler/test_compiler_pipeline.cpp b/mlir/unittests/Compiler/test_compiler_pipeline.cpp
index ddc3e4ce4d..8dab83823f 100644
--- a/mlir/unittests/Compiler/test_compiler_pipeline.cpp
+++ b/mlir/unittests/Compiler/test_compiler_pipeline.cpp
@@ -15,6 +15,11 @@
 #include "mlir/Dialect/QC/IR/QCDialect.h"
 #include "mlir/Dialect/QC/Translation/TranslateQuantumComputationToQC.h"
 #include "mlir/Dialect/QCO/IR/QCODialect.h"
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
+#include "mlir/Dialect/QCO/IR/QCOOps.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/UnitaryMatrices.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Utils.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
 #include "mlir/Dialect/QIR/Builder/QIRProgramBuilder.h"
 #include "mlir/Dialect/QTensor/IR/QTensorDialect.h"
 #include "mlir/Support/IRVerification.h"
@@ -24,6 +29,9 @@
 #include "quantum_computation_programs.h"
 
 #include <gtest/gtest.h>
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/Casting.h>
 #include <mlir/Dialect/Arith/IR/Arith.h>
 #include <mlir/Dialect/ControlFlow/IR/ControlFlow.h>
 #include <mlir/Dialect/Func/IR/FuncOps.h>
@@ -34,12 +42,16 @@
 #include <mlir/IR/DialectRegistry.h>
 #include <mlir/IR/MLIRContext.h>
 #include <mlir/IR/OwningOpRef.h>
+#include <mlir/IR/Value.h>
 #include <mlir/IR/Verifier.h>
 #include <mlir/Parser/Parser.h>
+#include <mlir/Support/LogicalResult.h>
 
+#include <cstddef>
 #include <cstdlib>
 #include <iosfwd>
 #include <memory>
+#include <optional>
 #include <string>
 
 namespace mqt::test::compiler {
@@ -692,4 +704,250 @@ INSTANTIATE_TEST_SUITE_P(
                                  nullptr, MQT_NAMED_BUILDER(mlir::qc::ctrlTwo),
                                  MQT_NAMED_BUILDER(mlir::qir::ctrlTwo)}));
 
+namespace {
+
+class CompilerPipelineNativeSynthesisConfigTest : public testing::Test {
+protected:
+  std::unique_ptr<mlir::MLIRContext> context;
+  mlir::OwningOpRef<mlir::ModuleOp> module;
+  mlir::QuantumCompilerConfig config;
+
+  // Load dialects, build `staticQubitsWithOps`, enable intermediate recording.
+  void SetUp() override {
+    mlir::DialectRegistry dialects;
+    dialects.insert<mlir::qc::QCDialect, mlir::qco::QCODialect,
+                    mlir::qtensor::QTensorDialect, mlir::arith::ArithDialect,
+                    mlir::cf::ControlFlowDialect, mlir::func::FuncDialect,
+                    mlir::memref::MemRefDialect, mlir::scf::SCFDialect,
+                    mlir::LLVM::LLVMDialect>();
+    context = std::make_unique<mlir::MLIRContext>();
+    context->appendDialectRegistry(dialects);
+    context->loadAllAvailableDialects();
+    module = mlir::qc::QCProgramBuilder::build(context.get(),
+                                               mlir::qc::staticQubitsWithOps);
+    ASSERT_TRUE(module);
+    config.recordIntermediates = true;
+  }
+
+  // Pipeline drivers: both run on `module` with the current `config`.
+  [[nodiscard]] mlir::CompilationRecord runPipelineAndExpectSuccess() const {
+    mlir::CompilationRecord record;
+    mlir::QuantumCompilerPipeline pipeline(config);
+    EXPECT_TRUE(mlir::succeeded(pipeline.runPipeline(module.get(), &record)));
+    return record;
+  }
+
+  void runPipelineAndExpectFailure() const {
+    mlir::CompilationRecord record;
+    mlir::QuantumCompilerPipeline pipeline(config);
+    EXPECT_TRUE(pipeline.runPipeline(module.get(), &record).failed());
+  }
+};
+
+} // namespace
+
+using mqt::test::isEquivalentUpToGlobalPhase;
+
+/// Compute the 4×4 unitary of a two-qubit QCO module whose qubits are
+/// introduced by `qco.static` ops with indices 0 and 1. One- and two-qubit
+/// `UnitaryOpInterface` ops are multiplied in program order (singly-controlled
+/// X/Z use hard-coded CX/CZ matrices); `BarrierOp`, `GPhaseOp`, and ops
+/// without that interface are skipped. Returns `std::nullopt` for a null
+/// module, a static index >= 2, an untracked qubit, an unavailable matrix, an
+/// unsupported controlled gate, or a unitary op of any other arity.
+static std::optional<mlir::qco::Matrix4x4>
+computeStaticTwoQubitUnitary(mlir::ModuleOp module) {
+  if (module == nullptr) {
+    return std::nullopt;
+  }
+
+  mlir::qco::Matrix4x4 unitary = mlir::qco::Matrix4x4::identity();
+  llvm::DenseMap<mlir::Value, std::size_t> qubitIds;
+  // Resolves an SSA qubit value to its static index, if it is tracked.
+  const auto getQubitId = [&](mlir::Value qubit) -> std::optional<std::size_t> {
+    const auto it = qubitIds.find(qubit);
+    if (it == qubitIds.end()) {
+      return std::nullopt;
+    }
+    return it->second;
+  };
+
+  for (auto func : module.getOps<mlir::func::FuncOp>()) {
+    for (auto& block : func.getBlocks()) {
+      for (auto& rawOp : block.getOperations()) {
+        if (auto staticOp = llvm::dyn_cast<mlir::qco::StaticOp>(&rawOp)) {
+          const auto index = static_cast<std::size_t>(staticOp.getIndex());
+          if (index >= 2) {
+            return std::nullopt;
+          }
+          qubitIds.try_emplace(staticOp.getResult(), index);
+          continue;
+        }
+
+        if (llvm::isa<mlir::qco::BarrierOp, mlir::qco::GPhaseOp>(&rawOp)) {
+          continue;
+        }
+
+        auto op = llvm::dyn_cast<mlir::qco::UnitaryOpInterface>(&rawOp);
+        if (!op) {
+          continue; // ops without the unitary interface are ignored
+        }
+        // Later ops are multiplied from the left onto the accumulated unitary.
+        if (op.isSingleQubit()) {
+          const auto qid = getQubitId(op.getInputQubit(0));
+          if (!qid) {
+            return std::nullopt;
+          }
+          mlir::qco::Matrix2x2 oneQ;
+          if (!op.getUnitaryMatrix2x2(oneQ)) {
+            return std::nullopt;
+          }
+          unitary =
+              mlir::qco::decomposition::expandToTwoQubits(oneQ, *qid) * unitary;
+          qubitIds[op.getOutputQubit(0)] = *qid; // output inherits the id
+          continue;
+        }
+
+        if (op.isTwoQubit()) {
+          const auto q0 = getQubitId(op.getInputQubit(0));
+          const auto q1 = getQubitId(op.getInputQubit(1));
+          if (!q0 || !q1) {
+            return std::nullopt;
+          }
+          mlir::qco::Matrix4x4 twoQ;
+          if (auto ctrl = llvm::dyn_cast<mlir::qco::CtrlOp>(&rawOp)) {
+            if (ctrl.getNumControls() != 1 || ctrl.getNumTargets() != 1) {
+              return std::nullopt;
+            }
+            auto* body = ctrl.getBodyUnitary(0).getOperation();
+            if (llvm::isa<mlir::qco::XOp>(body)) {
+              // CX matrix (same 4×4 layout as QCO unitary interface).
+              twoQ = mlir::qco::Matrix4x4::fromElements(1, 0, 0, 0, 0, 1, 0, 0,
+                                                        0, 0, 0, 1, 0, 0, 1, 0);
+            } else if (llvm::isa<mlir::qco::ZOp>(body)) {
+              // CZ matrix: identity with a `-1` phase on the `|11>` entry.
+              twoQ = mlir::qco::Matrix4x4::fromElements(
+                  1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, -1);
+            } else {
+              return std::nullopt;
+            }
+          } else if (!op.getUnitaryMatrix4x4(twoQ)) {
+            return std::nullopt;
+          }
+          const llvm::SmallVector<mlir::qco::decomposition::QubitId, 2> ids{
+              static_cast<mlir::qco::decomposition::QubitId>(*q0),
+              static_cast<mlir::qco::decomposition::QubitId>(*q1)};
+          unitary =
+              mlir::qco::decomposition::fixTwoQubitMatrixQubitOrder(twoQ, ids) *
+              unitary;
+          qubitIds[op.getOutputQubit(0)] = *q0;
+          qubitIds[op.getOutputQubit(1)] = *q1;
+          continue;
+        }
+        // Unitary op that is neither single- nor two-qubit: give up.
+        return std::nullopt;
+      }
+    }
+  }
+
+  return unitary;
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       AppliesConfiguredNativeSynthesisProfileInStage5) {
+  config.nativeGates = "x,sx,rz,cx";
+  const auto stages = runPipelineAndExpectSuccess();
+  const auto hInStage4 = stages.afterQCOCanon.find("qco.h");
+  const auto hInStage5 = stages.afterOptimization.find("qco.h");
+  // The source program's H operations are still present after stage 4 ...
+  EXPECT_NE(hInStage4, std::string::npos);
+  // ... and must have been rewritten by stage 5 once a menu is configured.
+  EXPECT_EQ(hInStage5, std::string::npos);
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       AppliesConfiguredU3CxNativeSynthesisProfileInStage5) {
+  config.nativeGates = "u,cx";
+  const auto stages = runPipelineAndExpectSuccess();
+  const auto& synthesized = stages.afterOptimization;
+  // H appears before stage 5; afterwards it is gone and `qco.u` appears.
+  EXPECT_NE(stages.afterQCOCanon.find("qco.h"), std::string::npos);
+  EXPECT_EQ(synthesized.find("qco.h"), std::string::npos);
+  EXPECT_NE(synthesized.find("qco.u"), std::string::npos);
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       AppliesConfiguredExpandedNativeSynthesisProfileInStage5) {
+  config.nativeGates = "u,rx,rz,cx,cz";
+  const auto stages = runPipelineAndExpectSuccess();
+  const auto hInStage4 = stages.afterQCOCanon.find("qco.h");
+  const auto hInStage5 = stages.afterOptimization.find("qco.h");
+  EXPECT_NE(hInStage4, std::string::npos); // H present before synthesis
+  EXPECT_EQ(hInStage5, std::string::npos); // and absent afterwards
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       RejectsUnderSpecifiedNativeSynthesisMenuInStage5) {
+  // Entanglers alone leave the synthesis without any single-qubit gate to
+  // emit, so the pipeline is expected to fail on this menu.
+  const std::string entanglersOnly = "cx,cz";
+  config.nativeGates = entanglersOnly;
+  runPipelineAndExpectFailure();
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       RejectsInvalidNativeGateTokenInStage5) {
+  // A token that is not part of the menu vocabulary must make the run fail.
+  const std::string unknownToken = "not-a-gate";
+  config.nativeGates = unknownToken;
+  runPipelineAndExpectFailure();
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       LeavesIRUnchangedWhenNoNativeProfileIsConfigured) {
+  // With the documented default (empty `nativeGates`) stage 5 must be a
+  // no-op, so the recorded stage-4 (QCO canonicalized) and stage-5
+  // (optimization + native gate synthesis) IR strings must compare equal.
+  config.nativeGates.clear();
+  const auto stages = runPipelineAndExpectSuccess();
+  const auto& before = stages.afterQCOCanon;
+  const auto& after = stages.afterOptimization;
+  EXPECT_NE(before.find("qco.h"), std::string::npos);
+  EXPECT_EQ(before, after);
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       LeavesIRUnchangedWhenNativeGatesIsWhitespaceOnly) {
+  config.nativeGates = "   \t  ";
+  const auto stages = runPipelineAndExpectSuccess();
+  const auto& before = stages.afterQCOCanon;
+  // Whitespace-only menu: H must remain and both recorded IRs must be equal.
+  EXPECT_NE(before.find("qco.h"), std::string::npos);
+  EXPECT_EQ(before, stages.afterOptimization);
+}
+
+TEST_F(CompilerPipelineNativeSynthesisConfigTest,
+       NativeSynthesisPreservesUnitaryOnStaticQubits) {
+  // End-to-end equivalence check. The pipeline lowers `staticQubitsWithOps`
+  // onto the `x,sx,rz,cx` menu; the 4×4 unitary recomputed from the stage-5 IR
+  // must agree with the one recomputed from the stage-4 (`afterQCOCanon`) IR
+  // up to a global phase.
+  config.nativeGates = "x,sx,rz,cx";
+  const auto stages = runPipelineAndExpectSuccess();
+  auto* const ctx = context.get();
+  // Both recorded IR strings are re-parsed into modules before any check.
+  auto before =
+      mlir::parseSourceString<mlir::ModuleOp>(stages.afterQCOCanon, ctx);
+  auto after =
+      mlir::parseSourceString<mlir::ModuleOp>(stages.afterOptimization, ctx);
+  ASSERT_TRUE(before);
+  ASSERT_TRUE(after);
+
+  const auto beforeU = computeStaticTwoQubitUnitary(before.get());
+  const auto afterU = computeStaticTwoQubitUnitary(after.get());
+  ASSERT_TRUE(beforeU);
+  ASSERT_TRUE(afterU);
+  EXPECT_TRUE(isEquivalentUpToGlobalPhase(*beforeU, *afterU));
+}
+
 } // namespace mqt::test::compiler
diff --git a/mlir/unittests/Dialect/QCO/Transforms/CMakeLists.txt b/mlir/unittests/Dialect/QCO/Transforms/CMakeLists.txt
index d59780f461..163412b775 100644
--- a/mlir/unittests/Dialect/QCO/Transforms/CMakeLists.txt
+++ b/mlir/unittests/Dialect/QCO/Transforms/CMakeLists.txt
@@ -8,4 +8,5 @@
 
 add_subdirectory(Decomposition)
 add_subdirectory(Mapping)
+add_subdirectory(NativeSynthesis)
 add_subdirectory(Optimizations)
diff --git a/mlir/unittests/Dialect/QCO/Transforms/Decomposition/CMakeLists.txt b/mlir/unittests/Dialect/QCO/Transforms/Decomposition/CMakeLists.txt
index f493bb9e4d..21985a2967 100644
--- a/mlir/unittests/Dialect/QCO/Transforms/Decomposition/CMakeLists.txt
+++ b/mlir/unittests/Dialect/QCO/Transforms/Decomposition/CMakeLists.txt
@@ -7,12 +7,23 @@
 # Licensed under the MIT License
 
 set(target_name mqt-core-mlir-unittest-decomposition)
-add_executable(${target_name} test_euler_decomposition.cpp)
+add_executable(${target_name} test_euler_decomposition.cpp test_weyl_decomposition.cpp)
 
-target_link_libraries(${target_name} PRIVATE GTest::gtest_main MLIRQCOProgramBuilder
-                                             MLIRQCOTransforms)
-target_link_libraries(${target_name} PRIVATE MLIRPass MLIRFuncDialect MLIRArithDialect MLIRIR
-                                             MLIRSupport MLIRQTensorDialect)
+target_link_libraries(
+  ${target_name}
+  PRIVATE GTest::gtest_main
+          MLIRQCPrograms
+          MLIRQCOProgramBuilder
+          MLIRQCToQCO
+          MLIRQCOTransforms
+          MLIRQCOUtils
+          MLIRPass
+          MLIRFuncDialect
+          MLIRArithDialect
+          MLIRIR
+          MLIRSupport
+          MLIRQTensorDialect
+          LLVMSupport)
 
 mqt_mlir_configure_unittest_target(${target_name})
 
diff --git a/mlir/unittests/Dialect/QCO/Transforms/Decomposition/decomposition_test_utils.h b/mlir/unittests/Dialect/QCO/Transforms/Decomposition/decomposition_test_utils.h
new file mode 100644
index 0000000000..e81b725308
--- /dev/null
+++ b/mlir/unittests/Dialect/QCO/Transforms/Decomposition/decomposition_test_utils.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#pragma once
+
+#include "TestCaseUtils.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Helpers.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/UnitaryMatrices.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <cassert>
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <random>
+#include <vector>
+
+namespace mlir::qco::decomposition_test {
+
+using mqt::test::isEquivalentUpToGlobalPhase;
+
+/// `U3(theta, phi, lambda)` as a 2x2 matrix. Forwards its three angles
+/// unchanged to the library `uMatrix`, so all tests share one implementation.
+[[nodiscard]] inline Matrix2x2 u3Matrix(double theta, double phi,
+                                        double lambda) {
+  return decomposition::uMatrix(theta, phi, lambda);
+}
+
+namespace detail {
+
+/// Generate a Haar-ish random unitary as a row-major `dim x dim` buffer via
+/// modified Gram-Schmidt on Gaussian-random complex columns.
+[[nodiscard]] inline std::vector<std::complex<double>>
+randomUnitaryData(std::size_t dim, std::mt19937& rng) {
+  std::normal_distribution<double> normalDist(0.0, 1.0);
+  std::vector<std::vector<std::complex<double>>> columns(
+      dim, std::vector<std::complex<double>>(dim));
+  for (auto& column : columns) {
+    for (auto& entry : column) {
+      const double re = normalDist(rng); // sequenced draws: call-argument
+      const double im = normalDist(rng); // evaluation order is unspecified
+      entry = std::complex<double>(re, im);
+    }
+  }
+  for (std::size_t j = 0; j < dim; ++j) {
+    for (std::size_t k = 0; k < j; ++k) {
+      std::complex<double> projection{0.0, 0.0};
+      for (std::size_t i = 0; i < dim; ++i) {
+        projection += std::conj(columns[k][i]) * columns[j][i];
+      }
+      for (std::size_t i = 0; i < dim; ++i) {
+        columns[j][i] -= projection * columns[k][i];
+      }
+    }
+    double norm = 0.0;
+    for (const auto& entry : columns[j]) {
+      norm += std::norm(entry);
+    }
+    norm = std::sqrt(norm);
+    for (auto& entry : columns[j]) {
+      entry /= norm;
+    }
+  }
+  std::vector<std::complex<double>> data(dim * dim);
+  for (std::size_t idx = 0; idx < dim * dim; ++idx) {
+    data[idx] = columns[idx % dim][idx / dim]; // idx == row * dim + col
+  }
+  return data;
+}
+
+} // namespace detail
+
+/// Draw a random `4×4` unitary matrix using `rng`.
+[[nodiscard]] inline Matrix4x4 randomUnitary4x4(std::mt19937& rng) {
+  const auto d = detail::randomUnitaryData(4, rng);
+  const Matrix4x4 result = Matrix4x4::fromElements(
+      d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8], d[9], d[10],
+      d[11], d[12], d[13], d[14], d[15]);
+  // Sanity check through `assert`, so it only runs when asserts are enabled.
+  assert((result.adjoint() * result).isIdentity(1e-12));
+  return result;
+}
+
+} // namespace mlir::qco::decomposition_test
diff --git a/mlir/unittests/Dialect/QCO/Transforms/Decomposition/test_euler_decomposition.cpp b/mlir/unittests/Dialect/QCO/Transforms/Decomposition/test_euler_decomposition.cpp
index fa9ba1d4ea..67755f2d1f 100644
--- a/mlir/unittests/Dialect/QCO/Transforms/Decomposition/test_euler_decomposition.cpp
+++ b/mlir/unittests/Dialect/QCO/Transforms/Decomposition/test_euler_decomposition.cpp
@@ -264,6 +264,8 @@ template <typename OpTy>
     return countOps<UOp>(funcOp);
   case ZSXX:
     return countZSXXGates(funcOp);
+  case R:
+    return countOps<ROp>(funcOp);
   }
   return 0;
 }
@@ -472,6 +474,8 @@ TEST(EulerSynthesisTest, RandomReconstructionAllBases) {
     return isa<UOp>(op);
   case ZSXX:
     return isa<RZOp, SXOp, XOp>(op);
+  case R:
+    return isa<ROp>(op);
   }
   return false;
 }
diff --git a/mlir/unittests/Dialect/QCO/Transforms/Decomposition/test_weyl_decomposition.cpp b/mlir/unittests/Dialect/QCO/Transforms/Decomposition/test_weyl_decomposition.cpp
new file mode 100644
index 0000000000..252ecdd9e1
--- /dev/null
+++ b/mlir/unittests/Dialect/QCO/Transforms/Decomposition/test_weyl_decomposition.cpp
@@ -0,0 +1,680 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "decomposition_test_utils.h"
+#include "mlir/Conversion/QCToQCO/QCToQCO.h"
+#include "mlir/Dialect/QC/Builder/QCProgramBuilder.h"
+#include "mlir/Dialect/QCO/IR/QCODialect.h"
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
+#include "mlir/Dialect/QCO/IR/QCOOps.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/BasisDecomposer.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Helpers.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/UnitaryMatrices.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/WeylDecomposition.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Utils.h"
+#include "mlir/Dialect/QCO/Transforms/Passes.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+
+#include <gtest/gtest.h>
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/Casting.h>
+#include <mlir/Dialect/Arith/IR/Arith.h>
+#include <mlir/Dialect/Func/IR/FuncOps.h>
+#include <mlir/Dialect/QC/IR/QCDialect.h>
+#include <mlir/IR/BuiltinOps.h>
+#include <mlir/IR/DialectRegistry.h>
+#include <mlir/IR/MLIRContext.h>
+#include <mlir/IR/OwningOpRef.h>
+#include <mlir/IR/Value.h>
+#include <mlir/IR/Verifier.h>
+#include <mlir/Pass/PassManager.h>
+#include <mlir/Support/LogicalResult.h>
+
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <random>
+#include <tuple>
+
+using namespace mlir;
+using namespace mlir::qco;
+using namespace mlir::qco::decomposition;
+using namespace mlir::qco::decomposition_test;
+using namespace mlir::qco::helpers;
+using namespace mlir::qco::native_synth;
+
+// Weyl / basis / helpers.
+
+TEST(DecompositionHelpersTest, RemEuclidNeverNegative) {
+  EXPECT_DOUBLE_EQ(remEuclid(0.0, 2.5), 0.0);  // zero dividend stays zero
+  EXPECT_DOUBLE_EQ(remEuclid(7.0, 3.0), 1.0);  // positive dividend: 7 - 2 * 3
+  EXPECT_DOUBLE_EQ(remEuclid(-1.0, 3.0), 2.0); // negative dividend: -1 + 3
+}
+
+// |3 + 4i| = 5, so the expected value is (4 + 5 * 5) / 20.
+TEST(DecompositionHelpersTest, TraceToFidelityMatchesFormula) {
+  const std::complex<double> trace{3.0, 4.0};
+  EXPECT_DOUBLE_EQ(traceToFidelity(trace), (4.0 + (5.0 * 5.0)) / 20.0);
+}
+
+// The factor returned for a phase of 1.25 must have magnitude one.
+TEST(DecompositionHelpersTest, GlobalPhaseFactorUnitMagnitude) {
+  EXPECT_NEAR(std::abs(globalPhaseFactor(1.25)), 1.0, 1e-14);
+}
+
+// diag(2, 2) is not unitary and must be rejected.
+TEST(DecompositionHelpersTest, IsUnitaryMatrixRejectsNonUnitary) {
+  EXPECT_FALSE(isUnitaryMatrix(Matrix2x2::fromElements(2.0, 0.0, 0.0, 2.0)));
+}
+
+// The 2x2 identity must be accepted as unitary.
+TEST(DecompositionHelpersTest, IsUnitaryMatrixAcceptsUnitary) {
+  EXPECT_TRUE(isUnitaryMatrix(Matrix2x2::identity()));
+}
+
+//===----------------------------------------------------------------------===//
+// Weyl decomposition
+//===----------------------------------------------------------------------===//
+
+// NOLINTNEXTLINE(misc-use-internal-linkage) -- gtest `TEST_P` at global scope
+class WeylDecompositionTest : public testing::TestWithParam<Matrix4x4 (*)()> {
+public:
+  /// `kron(k1l, k1r)` of the decomposition.
+  [[nodiscard]] static Matrix4x4 k1(const TwoQubitWeylDecomposition& weyl) {
+    return kron(weyl.k1l(), weyl.k1r());
+  }
+  /// `kron(k2l, k2r)` of the decomposition.
+  [[nodiscard]] static Matrix4x4 k2(const TwoQubitWeylDecomposition& weyl) {
+    return kron(weyl.k2l(), weyl.k2r());
+  }
+  /// Canonical matrix reported by the decomposition.
+  [[nodiscard]] static Matrix4x4 can(const TwoQubitWeylDecomposition& weyl) {
+    return weyl.getCanonicalMatrix();
+  }
+  /// Complex factor for the decomposition's global phase.
+  [[nodiscard]] static std::complex<double>
+  globalPhaseFactor(const TwoQubitWeylDecomposition& weyl) {
+    return helpers::globalPhaseFactor(weyl.globalPhase());
+  }
+  /// Multiplies the parts back together: `k1 * can * k2 * phase`.
+  [[nodiscard]] static Matrix4x4
+  restore(const TwoQubitWeylDecomposition& weyl) {
+    return k1(weyl) * can(weyl) * k2(weyl) * globalPhaseFactor(weyl);
+  }
+};
+
+// Round trip with `create`'s second argument at exactly 1.0 (presumably the
+// requested fidelity -- confirm): the restored matrix must match the input.
+TEST_P(WeylDecompositionTest, TestExact) {
+  const Matrix4x4 input = GetParam()();
+  const auto weyl =
+      TwoQubitWeylDecomposition::create(input, std::optional<double>{1.0});
+  EXPECT_TRUE(restore(weyl).isApprox(input));
+}
+
+// Round trip with `create`'s second argument just below 1.0 (1 - 1e-12,
+// presumably the requested fidelity -- confirm); must still match the input.
+TEST_P(WeylDecompositionTest, TestApproximation) {
+  const Matrix4x4 input = GetParam()();
+  const auto weyl = TwoQubitWeylDecomposition::create(
+      input, std::optional<double>{1.0 - 1e-12});
+  EXPECT_TRUE(restore(weyl).isApprox(input));
+}
+
+// CNOT: parameters a, b, c must lie in [0, pi/4] (slack 1e-10), locals unitary.
+TEST(WeylDecompositionStandalone,
+     CnotProducesValidWeylParametersAndUnitaryLocals) {
+  const Matrix4x4 cnot = Matrix4x4::fromElements(1, 0, 0, 0, // row 0
+                                                 0, 1, 0, 0, // row 1
+                                                 0, 0, 0, 1, // row 2
+                                                 0, 0, 1, 0);
+  const auto weyl = TwoQubitWeylDecomposition::create(cnot, std::nullopt);
+  constexpr double upper = 0.7853981633974483 + 1e-10; // pi/4 plus slack
+  EXPECT_GE(weyl.a(), -1e-10);
+  EXPECT_LE(weyl.a(), upper);
+  EXPECT_GE(weyl.b(), -1e-10);
+  EXPECT_LE(weyl.b(), upper);
+  EXPECT_GE(weyl.c(), -1e-10);
+  EXPECT_LE(weyl.c(), upper);
+  EXPECT_TRUE(helpers::isUnitaryMatrix(weyl.k1l()));
+  EXPECT_TRUE(helpers::isUnitaryMatrix(weyl.k1r()));
+  EXPECT_TRUE(helpers::isUnitaryMatrix(weyl.k2l()));
+  EXPECT_TRUE(helpers::isUnitaryMatrix(weyl.k2r()));
+}
+
+// Round-trips 5000 seeded random unitaries through the Weyl decomposition.
+TEST(WeylDecompositionStandalone, Random) {
+  constexpr auto maxIterations = 5000;
+  std::mt19937 rng{1234567UL};
+  for (int i = 0; i < maxIterations; ++i) {
+    const auto originalMatrix = randomUnitary4x4(rng);
+    const auto decomposition = TwoQubitWeylDecomposition::create(
+        originalMatrix, std::optional<double>{1.0 - 1e-12});
+    const auto restoredMatrix = WeylDecompositionTest::restore(decomposition);
+    // The reconstruction accuracy is bounded by the iterative diagonalization
+    // residual rather than the (much tighter) default matrix tolerance.
+    EXPECT_TRUE(
+        restoredMatrix.isApprox(originalMatrix, SANITY_CHECK_PRECISION))
+        << "mismatch at random sample " << i;
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P( // targets built only from single-qubit factors
+    ProductTwoQubitMatrices, WeylDecompositionTest,
+    ::testing::Values([]() -> Matrix4x4 { return Matrix4x4::identity(); },
+                      []() -> Matrix4x4 { // kron of an RZ and an RY matrix
+                        return kron(rzMatrix(1.0), ryMatrix(3.1));
+                      },
+                      []() -> Matrix4x4 { // kron of the identity and an RX
+                        return kron(Matrix2x2::identity(), rxMatrix(0.1));
+                      }));
+
+INSTANTIATE_TEST_SUITE_P( // targets that include two-qubit gate matrices
+    TwoQubitMatrices, WeylDecompositionTest,
+    ::testing::Values(
+        []() -> Matrix4x4 { return rzzMatrix(2.0); },
+        []() -> Matrix4x4 { // product of RYY, RZZ and RXX matrices
+          return ryyMatrix(1.0) * rzzMatrix(3.0) * rxxMatrix(2.0);
+        },
+        []() -> Matrix4x4 { // canonical matrix times one kron layer
+          return TwoQubitWeylDecomposition::getCanonicalMatrix(1.5, -0.2, 0.0) *
+                 kron(rxMatrix(1.0), Matrix2x2::identity());
+        },
+        []() -> Matrix4x4 { // canonical matrix between two kron layers
+          return kron(rxMatrix(1.0), ryMatrix(1.0)) *
+                 TwoQubitWeylDecomposition::getCanonicalMatrix(1.1, 0.2, 3.0) *
+                 kron(rxMatrix(1.0), Matrix2x2::identity());
+        },
+        []() -> Matrix4x4 { // `cxGate01()` between two kron layers
+          return kron(hGate(), ipz()) * cxGate01() * kron(ipx(), ipy());
+        }));
+
+//===----------------------------------------------------------------------===//
+// Basis decomposer
+//===----------------------------------------------------------------------===//
+
+// NOLINTNEXTLINE(misc-use-internal-linkage) -- gtest `TEST_P` at global scope
+class BasisDecomposerTest : public testing::TestWithParam<
+                                std::tuple<Matrix4x4 (*)(), Matrix4x4 (*)()>> {
+public:
+  /// Multiplies out: local layer, then (entangler, local layer) per use.
+  [[nodiscard]] static Matrix4x4
+  restore(const TwoQubitNativeDecomposition& decomposition,
+          const Matrix4x4& entangler) {
+    const auto& locals = decomposition.singleQubitFactors;
+    Matrix4x4 circuit = kron(locals[1], locals[0]);
+    for (std::uint8_t use = 0; use < decomposition.numBasisUses; ++use) {
+      const std::size_t next = static_cast<std::size_t>(use) + 1;
+      circuit = kron(locals[(2 * next) + 1], locals[2 * next]) *
+                (entangler * circuit);
+    }
+    return circuit * helpers::globalPhaseFactor(decomposition.globalPhase);
+  }
+
+protected:
+  void SetUp() override {
+    const auto& [makeBasis, makeTarget] = GetParam();
+    basisMatrix = makeBasis();
+    target = makeTarget();
+    targetDecomposition = std::make_unique<TwoQubitWeylDecomposition>(
+        TwoQubitWeylDecomposition::create(target, std::optional<double>{1.0}));
+  }
+
+  Matrix4x4 target;
+  Matrix4x4 basisMatrix;
+  std::unique_ptr<TwoQubitWeylDecomposition> targetDecomposition;
+};
+
+// Decomposes the fixture target over the parameterised entangler (with 1.0 as
+// second `create` argument) and checks that multiplying the synthesized
+// layers back together reproduces the target.
+TEST_P(BasisDecomposerTest, TestExact) {
+  auto decomposer = TwoQubitBasisDecomposer::create(basisMatrix, 1.0);
+  const auto synthesized =
+      decomposer.twoQubitDecompose(*targetDecomposition, std::nullopt);
+  ASSERT_TRUE(synthesized.has_value());
+
+  const Matrix4x4 rebuilt = restore(*synthesized, basisMatrix);
+  EXPECT_TRUE(rebuilt.isApprox(target));
+}
+
+// Decomposes the fixture target over the parameterised entangler (with
+// 1 - 1e-12 as second `create` argument) and checks that multiplying the
+// synthesized layers back together still reproduces the target.
+TEST_P(BasisDecomposerTest, TestApproximation) {
+  auto decomposer = TwoQubitBasisDecomposer::create(basisMatrix, 1.0 - 1e-12);
+  const auto synthesized =
+      decomposer.twoQubitDecompose(*targetDecomposition, std::nullopt);
+  ASSERT_TRUE(synthesized.has_value());
+
+  const Matrix4x4 rebuilt = restore(*synthesized, basisMatrix);
+  EXPECT_TRUE(rebuilt.isApprox(target));
+}
+
+// Decomposes 2000 seeded random unitaries over a randomly chosen entangler
+// (`cxGate01` or `cxGate10`) and checks the synthesized circuit against each.
+TEST(BasisDecomposerTest, Random) {
+  constexpr auto maxIterations = 2000;
+  std::mt19937 rng{123456UL};
+
+  const llvm::SmallVector<Matrix4x4, 2> basisMatrices{cxGate01(), cxGate10()};
+  std::uniform_int_distribution<std::size_t> distBasisGate{
+      0, basisMatrices.size() - 1};
+
+  for (int i = 0; i < maxIterations; ++i) {
+    // Keep this draw order (target first, then entangler): it determines the
+    // seeded sample sequence.
+    const auto originalMatrix = randomUnitary4x4(rng);
+    const auto targetDecomposition = TwoQubitWeylDecomposition::create(
+        originalMatrix, std::optional<double>{1.0});
+    const auto basisMatrix = basisMatrices[distBasisGate(rng)];
+    auto decomposer = TwoQubitBasisDecomposer::create(basisMatrix, 1.0);
+    const auto decomposed =
+        decomposer.twoQubitDecompose(targetDecomposition, std::nullopt);
+    // Name the failing sample; the bare assertion gives no hint which of the
+    // iterations broke.
+    ASSERT_TRUE(decomposed.has_value()) << "no decomposition for sample " << i;
+    const auto restoredMatrix =
+        BasisDecomposerTest::restore(*decomposed, basisMatrix);
+    // Reconstruction accumulates the Weyl diagonalization residual through up
+    // to three entangler layers, so allow a correspondingly relaxed tolerance.
+    EXPECT_TRUE(
+        restoredMatrix.isApprox(originalMatrix, SANITY_CHECK_PRECISION))
+        << "mismatch at random sample " << i;
+  }
+}
+
+// Forcing zero entangler uses on the identity must succeed and round-trip.
+TEST(BasisDecomposerNumBasisTest, ForcesZeroBasisUsesForIdentityTarget) {
+  const Matrix4x4 identity = Matrix4x4::identity();
+  const auto weyl =
+      TwoQubitWeylDecomposition::create(identity, std::optional<double>{1.0});
+  const auto cx = cxGate01();
+  const auto decomposer = TwoQubitBasisDecomposer::create(cx, 1.0);
+  const auto result = decomposer.twoQubitDecompose(weyl, std::uint8_t{0});
+  ASSERT_TRUE(result.has_value());
+  EXPECT_EQ(result->numBasisUses, 0);
+  EXPECT_TRUE(BasisDecomposerTest::restore(*result, cx).isApprox(identity));
+}
+
+INSTANTIATE_TEST_SUITE_P( // every entangler is combined with every target
+    ProductTwoQubitMatrices, BasisDecomposerTest,
+    testing::Combine(
+        // basis entanglers (tuple element 0, read by `SetUp` as `basisMatrix`)
+        testing::Values([]() -> Matrix4x4 { return cxGate01(); },
+                        []() -> Matrix4x4 { return cxGate10(); }),
+        // targets to be decomposed (tuple element 1, read as `target`)
+        testing::Values([]() -> Matrix4x4 { return Matrix4x4::identity(); },
+                        []() -> Matrix4x4 { // kron of an RZ and an RY matrix
+                          return kron(rzMatrix(1.0), ryMatrix(3.1));
+                        },
+                        []() -> Matrix4x4 { // kron of the identity and an RX
+                          return kron(Matrix2x2::identity(), rxMatrix(0.1));
+                        })));
+
+INSTANTIATE_TEST_SUITE_P( // every entangler is combined with every target
+    TwoQubitMatrices, BasisDecomposerTest,
+    testing::Combine(
+        // basis entanglers (tuple element 0, read by `SetUp` as `basisMatrix`)
+        testing::Values([]() -> Matrix4x4 { return cxGate01(); },
+                        []() -> Matrix4x4 { return cxGate10(); }),
+        // targets to be decomposed (tuple element 1, read as `target`)
+        ::testing::Values(
+            []() -> Matrix4x4 { return rzzMatrix(2.0); },
+            []() -> Matrix4x4 { // product of RYY, RZZ and RXX matrices
+              return ryyMatrix(1.0) * rzzMatrix(3.0) * rxxMatrix(2.0);
+            },
+            []() -> Matrix4x4 { // canonical matrix times one kron layer
+              return TwoQubitWeylDecomposition::getCanonicalMatrix(1.5, -0.2,
+                                                                   0.0) *
+                     kron(rxMatrix(1.0), Matrix2x2::identity());
+            },
+            []() -> Matrix4x4 { // canonical matrix between two kron layers
+              return kron(rxMatrix(1.0), ryMatrix(1.0)) *
+                     TwoQubitWeylDecomposition::getCanonicalMatrix(1.1, 0.2,
+                                                                   3.0) *
+                     kron(rxMatrix(1.0), Matrix2x2::identity());
+            },
+            []() -> Matrix4x4 { // `cxGate01()` between two kron layers
+              return kron(hGate(), ipz()) * cxGate01() * kron(ipx(), ipy());
+            })));
+
+namespace {
+
+/// Returns operand `index` of `op` when `index < op.getNumQubits()` and the
+/// operand has `qco::QubitType`; otherwise `std::nullopt`.
+[[nodiscard]] static std::optional<Value>
+getUnitaryQubitOperand(qco::UnitaryOpInterface op, std::size_t index) {
+  if (index >= op.getNumQubits()) {
+    return std::nullopt;
+  }
+  const Value operand = op->getOperand(index);
+  const bool isQubit = llvm::isa<qco::QubitType>(operand.getType());
+  return isQubit ? std::optional<Value>{operand} : std::nullopt;
+}
+
+/// Returns result `index` of `op` when `index < op.getNumQubits()` and the
+/// result has `qco::QubitType`; otherwise `std::nullopt`.
+[[nodiscard]] static std::optional<Value>
+getUnitaryQubitResult(qco::UnitaryOpInterface op, std::size_t index) {
+  if (index >= op.getNumQubits()) {
+    return std::nullopt;
+  }
+  const Value result = op->getResult(index);
+  const bool isQubit = llvm::isa<qco::QubitType>(result.getType());
+  return isQubit ? std::optional<Value>{result} : std::nullopt;
+}
+
+/// Writes the 2x2 unitary of `op` to `out`: tries the fixed-size query first
+/// and falls back to a 2x2 dynamic matrix. Returns false if neither works.
+static bool extractSingleQubitMatrix(qco::UnitaryOpInterface op,
+                                     Matrix2x2& out) {
+  if (op.getUnitaryMatrix2x2(out)) {
+    return true;
+  }
+  qco::DynamicMatrix m;
+  if (op.getUnitaryMatrixDynamic(m) && m.rows() == 2 && m.cols() == 2) {
+    out = Matrix2x2::fromElements(m(0, 0), m(0, 1), m(1, 0), m(1, 1));
+    return true;
+  }
+  return false;
+}
+
+/// Fills `out` via `getBlockTwoQubitMatrix` first, then via the op's own 4x4
+/// query; returns whether either succeeded.
+static bool extractTwoQubitMatrix(qco::UnitaryOpInterface op, Matrix4x4& out) {
+  return getBlockTwoQubitMatrix(op.getOperation(), out) ||
+         op.getUnitaryMatrix4x4(out);
+}
+
+static std::optional<Matrix4x4> // 4x4 unitary of the module, else nullopt
+computeTwoQubitUnitaryFromModule(const OwningOpRef<ModuleOp>& moduleOp) {
+  ModuleOp module = moduleOp.get();
+  if (!module) {
+    return std::nullopt;
+  }
+  Matrix4x4 unitary = Matrix4x4::identity(); // product of gates seen so far
+  llvm::DenseMap<Value, std::size_t> qubitIds; // qubit SSA value -> wire id
+  std::size_t nextQubitId = 0;
+
+  for (auto func : module.getOps<func::FuncOp>()) { // pass 1: number allocs
+    for (auto& block : func.getBlocks()) {
+      for (auto& rawOp : block.getOperations()) {
+        if (auto alloc = llvm::dyn_cast<qco::AllocOp>(&rawOp)) {
+          if (nextQubitId >= 2) { // a third allocation is not supported
+            return std::nullopt;
+          }
+          qubitIds.try_emplace(alloc.getResult(), nextQubitId++);
+        }
+      }
+    }
+  }
+
+  auto getQubitId = [&](Value qubit) -> std::optional<std::size_t> {
+    auto it = qubitIds.find(qubit);
+    if (it == qubitIds.end()) {
+      return std::nullopt;
+    }
+    return it->second;
+  };
+
+  for (auto func : module.getOps<func::FuncOp>()) { // pass 2: multiply gates
+    for (auto& block : func.getBlocks()) {
+      for (auto& rawOp : block.getOperations()) {
+        auto op = llvm::dyn_cast<qco::UnitaryOpInterface>(&rawOp);
+        if (!op) {
+          continue;
+        }
+        if (llvm::isa<qco::BarrierOp, qco::GPhaseOp>(op.getOperation())) {
+          continue; // neither op is multiplied into the matrix
+        }
+
+        if (op.isSingleQubit()) {
+          const auto qIn = getUnitaryQubitOperand(op, 0);
+          if (!qIn) {
+            return std::nullopt;
+          }
+          auto qid = getQubitId(*qIn);
+          if (!qid) { // operand is not a tracked wire
+            return std::nullopt;
+          }
+          Matrix2x2 oneQ;
+          if (!extractSingleQubitMatrix(op, oneQ)) {
+            return std::nullopt;
+          }
+          unitary = decomposition::expandToTwoQubits(
+                        oneQ, static_cast<decomposition::QubitId>(*qid)) *
+                    unitary; // left-multiply: this gate acts after earlier ones
+          const auto qOut = getUnitaryQubitResult(op, 0);
+          if (!qOut) {
+            return std::nullopt;
+          }
+          qubitIds[*qOut] = *qid; // the result value continues the same wire
+          continue;
+        }
+
+        if (op.isTwoQubit()) {
+          const auto q0In = getUnitaryQubitOperand(op, 0);
+          const auto q1In = getUnitaryQubitOperand(op, 1);
+          if (!q0In || !q1In) {
+            return std::nullopt;
+          }
+          auto q0id = getQubitId(*q0In);
+          auto q1id = getQubitId(*q1In);
+          if (!q0id || !q1id) { // an operand is not a tracked wire
+            return std::nullopt;
+          }
+          Matrix4x4 twoQ;
+          if (!extractTwoQubitMatrix(op, twoQ)) {
+            return std::nullopt;
+          }
+          const llvm::SmallVector<decomposition::QubitId, 2> ids{
+              static_cast<decomposition::QubitId>(*q0id),
+              static_cast<decomposition::QubitId>(*q1id)};
+          unitary = // op matrix adjusted to the tracked wire ids, applied last
+              decomposition::fixTwoQubitMatrixQubitOrder(twoQ, ids) * unitary;
+          const auto q0Out = getUnitaryQubitResult(op, 0);
+          const auto q1Out = getUnitaryQubitResult(op, 1);
+          if (!q0Out || !q1Out) {
+            return std::nullopt;
+          }
+          qubitIds[*q0Out] = *q0id; // results continue the operand wires
+          qubitIds[*q1Out] = *q1id;
+          continue;
+        }
+      }
+    }
+  }
+
+  if (nextQubitId != 2) { // fewer than two allocations were found
+    return std::nullopt;
+  }
+  return unitary;
+}
+
+/// MLIR context holder; `setUp()` loads the QC, QCO, arith, func dialects.
+struct TwoQFuseFixture {
+  std::unique_ptr<MLIRContext> context;
+
+  void setUp() {
+    DialectRegistry dialects;
+    dialects.insert<mlir::qc::QCDialect, qco::QCODialect>();
+    dialects.insert<arith::ArithDialect, func::FuncDialect>();
+    context = std::make_unique<MLIRContext>();
+    context->appendDialectRegistry(dialects);
+    context->loadAllAvailableDialects();
+  }
+  [[nodiscard]] MLIRContext* ctx() const { return context.get(); }
+};
+
+static std::size_t countCtrlOps(const OwningOpRef<ModuleOp>& moduleOp) {
+  std::size_t numCtrlOps = 0; // tally of `qco::CtrlOp`s reached by the walk
+  moduleOp.get()->walk([&numCtrlOps](qco::CtrlOp) { ++numCtrlOps; });
+  return numCtrlOps;
+}
+
+static LogicalResult runQcToQco(ModuleOp moduleOp) { // QC -> QCO conversion
+  PassManager conversion(moduleOp.getContext());
+  conversion.addPass(mlir::createQCToQCO());
+  return conversion.run(moduleOp);
+}
+
+/// Runs only the two-qubit fusion pass on `moduleOp`, configured with the
+/// comma-separated `nativeGates` menu; returns the pass manager's result.
+static LogicalResult runTwoQFuse(ModuleOp moduleOp, StringRef nativeGates) {
+  mlir::qco::FuseTwoQubitUnitaryRunsOptions options{};
+  options.nativeGates = nativeGates.str();
+  PassManager fusion(moduleOp.getContext());
+  fusion.addPass(mlir::qco::createFuseTwoQubitUnitaryRuns(options));
+  return fusion.run(moduleOp);
+}
+
+template <typename ProgramT> // `program` is forwarded to the QC builder
+static OwningOpRef<ModuleOp> buildProgram(MLIRContext* ctx, ProgramT program) {
+  return mlir::qc::QCProgramBuilder::build(ctx, program); // result unchecked
+}
+
+static void fusionCxCx(mlir::qc::QCProgramBuilder& b) { // same CX twice
+  const auto wire0 = b.allocQubit();
+  const auto wire1 = b.allocQubit();
+  b.cx(wire0, wire1);
+  b.cx(wire0, wire1);
+}
+
+static void fusionHCxInterleavedTCx(mlir::qc::QCProgramBuilder& b) {
+  const auto wire0 = b.allocQubit();
+  const auto wire1 = b.allocQubit();
+  b.h(wire0);
+  b.cx(wire0, wire1);
+  b.t(wire1); // single-qubit gates on both wires between the two CX
+  b.s(wire0);
+  b.cx(wire0, wire1);
+}
+
+static void fusionThreeLineCx(mlir::qc::QCProgramBuilder& b) {
+  const auto wire0 = b.allocQubit();
+  const auto wire1 = b.allocQubit();
+  const auto wire2 = b.allocQubit();
+  b.cx(wire0, wire1);
+  b.cx(wire1, wire2); // different qubit pair between the two (0, 1) CX
+  b.cx(wire0, wire1);
+}
+
+static void fusionCxBarrierCx(mlir::qc::QCProgramBuilder& b) {
+  const auto wire0 = b.allocQubit();
+  const auto wire1 = b.allocQubit();
+  b.cx(wire0, wire1);
+  b.barrier({wire0, wire1}); // barrier on both wires between the two CX
+  b.cx(wire0, wire1);
+}
+
+static void fusionSwapCxPattern(mlir::qc::QCProgramBuilder& b) {
+  const auto wire0 = b.allocQubit();
+  const auto wire1 = b.allocQubit();
+  b.cx(wire0, wire1);
+  b.cx(wire1, wire0); // middle CX takes the wires in reversed order
+  b.cx(wire0, wire1);
+}
+
+static void fusionHRzzSRzz(mlir::qc::QCProgramBuilder& b) {
+  const auto wire0 = b.allocQubit();
+  const auto wire1 = b.allocQubit();
+  b.h(wire0);
+  b.rzz(-0.29, wire0, wire1);
+  b.s(wire1); // single-qubit gate between the two RZZ rotations
+  b.rzz(0.17, wire0, wire1);
+}
+
+// Builds `program` twice, fuses one copy, and compares both 4x4 unitaries.
+template <typename ProgramT>
+static void expectTwoQFusePreservesUnitary(MLIRContext* ctx, ProgramT program,
+                                           StringRef nativeGates) {
+  auto baseline = buildProgram(ctx, program);
+  ASSERT_TRUE(baseline);
+  ASSERT_TRUE(succeeded(runQcToQco(*baseline)));
+  const auto baselineUnitary = computeTwoQubitUnitaryFromModule(baseline);
+  ASSERT_TRUE(baselineUnitary.has_value());
+  auto fused = buildProgram(ctx, program);
+  ASSERT_TRUE(fused);
+  ASSERT_TRUE(succeeded(runQcToQco(*fused)));
+  ASSERT_TRUE(succeeded(runTwoQFuse(*fused, nativeGates)));
+  ASSERT_TRUE(succeeded(verify(*fused)));
+  const auto fusedUnitary = computeTwoQubitUnitaryFromModule(fused);
+  ASSERT_TRUE(fusedUnitary.has_value());
+  EXPECT_TRUE(isEquivalentUpToGlobalPhase(*baselineUnitary, *fusedUnitary));
+}
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// FuseTwoQubitUnitaryRuns tests
+//===----------------------------------------------------------------------===//
+
+TEST(FuseTwoQubitUnitaryRunsTest, InvalidNativeGatesFailsPass) {
+  TwoQFuseFixture env;
+  env.setUp();
+  auto qcoModule = buildProgram(env.ctx(), fusionCxCx);
+  ASSERT_TRUE(qcoModule);
+  ASSERT_TRUE(succeeded(runQcToQco(*qcoModule)));
+  EXPECT_TRUE(failed(runTwoQFuse(*qcoModule, "not-a-gate"))); // unknown token
+}
+
+TEST(FuseTwoQubitUnitaryRunsTest, AdjacentCxCancel) {
+  TwoQFuseFixture env;
+  env.setUp();
+  expectTwoQFusePreservesUnitary(env.ctx(), fusionCxCx, "u,cx");
+  // Beyond equivalence: no `qco::CtrlOp` may remain after fusing the two CX.
+  auto fusedModule = buildProgram(env.ctx(), fusionCxCx);
+  ASSERT_TRUE(fusedModule);
+  ASSERT_TRUE(succeeded(runQcToQco(*fusedModule)));
+  ASSERT_TRUE(succeeded(runTwoQFuse(*fusedModule, "u,cx")));
+  EXPECT_EQ(countCtrlOps(fusedModule), 0U);
+}
+
+TEST(FuseTwoQubitUnitaryRunsTest, FusesCxThroughInterleavedOneQOps) {
+  TwoQFuseFixture env;
+  env.setUp(); // fresh context; the check below compares unitaries only
+  expectTwoQFusePreservesUnitary(env.ctx(), fusionHCxInterleavedTCx, "u,cx");
+}
+
+TEST(FuseTwoQubitUnitaryRunsTest, StopsAtDifferentPairBoundary) {
+  TwoQFuseFixture env;
+  env.setUp();
+  auto fusedModule = buildProgram(env.ctx(), fusionThreeLineCx);
+  ASSERT_TRUE(fusedModule);
+  ASSERT_TRUE(succeeded(runQcToQco(*fusedModule)));
+  ASSERT_TRUE(succeeded(runTwoQFuse(*fusedModule, "u,cx")));
+  EXPECT_GE(countCtrlOps(fusedModule), 1U); // at least one CtrlOp must remain
+}
+
+TEST(FuseTwoQubitUnitaryRunsTest, DoesNotFuseAcrossBarrier) {
+  TwoQFuseFixture env;
+  env.setUp();
+  auto fusedModule = buildProgram(env.ctx(), fusionCxBarrierCx);
+  ASSERT_TRUE(fusedModule);
+  ASSERT_TRUE(succeeded(runQcToQco(*fusedModule)));
+  ASSERT_TRUE(succeeded(runTwoQFuse(*fusedModule, "u,cx")));
+  EXPECT_EQ(countCtrlOps(fusedModule), 2U); // both CtrlOps must still exist
+}
+
+TEST(FuseTwoQubitUnitaryRunsTest, HandlesSwappedWireOrder) {
+  TwoQFuseFixture env;
+  env.setUp(); // fresh context; the check below compares unitaries only
+  expectTwoQFusePreservesUnitary(env.ctx(), fusionSwapCxPattern, "u,cx");
+}
+
+TEST(FuseTwoQubitUnitaryRunsTest, HandlesRzzBlock) {
+  TwoQFuseFixture env;
+  env.setUp(); // this case uses a menu with `rzz` and `cz` instead of "u,cx"
+  expectTwoQFusePreservesUnitary(env.ctx(), fusionHRzzSRzz, "x,sx,rz,rx,rzz,cz");
+}
diff --git a/mlir/unittests/Dialect/QCO/Transforms/NativeSynthesis/CMakeLists.txt b/mlir/unittests/Dialect/QCO/Transforms/NativeSynthesis/CMakeLists.txt
new file mode 100644
index 0000000000..35463ee646
--- /dev/null
+++ b/mlir/unittests/Dialect/QCO/Transforms/NativeSynthesis/CMakeLists.txt
@@ -0,0 +1,27 @@
+# Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+# Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+# All rights reserved.
+#
+# SPDX-License-Identifier: MIT
+#
+# Licensed under the MIT License
+
+set(target_name mqt-core-mlir-unittest-native-synthesis) # test executable name
+add_executable(${target_name} test_native_synthesis.cpp)
+
+target_link_libraries(
+  ${target_name}
+  PRIVATE MLIRParser
+          GTest::gtest_main # GoogleTest target that provides main()
+          MLIRQCPrograms
+          MLIRQCOProgramBuilder
+          MLIRQCOUtils
+          MLIRQCToQCO
+          MLIRQCOTransforms
+          MLIRPass
+          MLIRSupport
+          LLVMSupport)
+
+mqt_mlir_configure_unittest_target(${target_name})
+
+gtest_discover_tests(${target_name} PROPERTIES LABELS mqt-mlir-unittests DISCOVERY_TIMEOUT 60) # registers the discovered tests with CTest
diff --git a/mlir/unittests/Dialect/QCO/Transforms/NativeSynthesis/test_native_synthesis.cpp b/mlir/unittests/Dialect/QCO/Transforms/NativeSynthesis/test_native_synthesis.cpp
new file mode 100644
index 0000000000..63ff826c61
--- /dev/null
+++ b/mlir/unittests/Dialect/QCO/Transforms/NativeSynthesis/test_native_synthesis.cpp
@@ -0,0 +1,1148 @@
+/*
+ * Copyright (c) 2023 - 2026 Chair for Design Automation, TUM
+ * Copyright (c) 2025 - 2026 Munich Quantum Software Company GmbH
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Licensed under the MIT License
+ */
+
+#include "TestCaseUtils.h"
+#include "mlir/Conversion/QCToQCO/QCToQCO.h"
+#include "mlir/Dialect/QCO/Builder/QCOProgramBuilder.h"
+#include "mlir/Dialect/QCO/IR/QCODialect.h"
+#include "mlir/Dialect/QCO/IR/QCOInterfaces.h"
+#include "mlir/Dialect/QCO/IR/QCOOps.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/Euler.h"
+#include "mlir/Dialect/QCO/Transforms/Decomposition/UnitaryMatrices.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/NativeSpec.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Policy.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Types.h"
+#include "mlir/Dialect/QCO/Transforms/NativeSynthesis/Utils.h"
+#include "mlir/Dialect/QCO/Transforms/Passes.h"
+#include "mlir/Dialect/QCO/Utils/Matrix.h"
+#include "qc_programs.h"
+
+#include <gtest/gtest.h>
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/Casting.h>
+#include <llvm/Support/raw_ostream.h>
+#include <mlir/Dialect/Arith/IR/Arith.h>
+#include <mlir/Dialect/Func/IR/FuncOps.h>
+#include <mlir/Dialect/MemRef/IR/MemRef.h>
+#include <mlir/Dialect/QC/Builder/QCProgramBuilder.h>
+#include <mlir/Dialect/QC/IR/QCDialect.h>
+#include <mlir/Dialect/SCF/IR/SCF.h>
+#include <mlir/IR/BuiltinOps.h>
+#include <mlir/IR/DialectRegistry.h>
+#include <mlir/IR/MLIRContext.h>
+#include <mlir/IR/Operation.h>
+#include <mlir/IR/OwningOpRef.h>
+#include <mlir/IR/Value.h>
+#include <mlir/Pass/PassManager.h>
+#include <mlir/Support/LogicalResult.h>
+#include <mlir/Support/WalkResult.h>
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <memory>
+#include <optional>
+#include <string>
+#include <tuple>
+#include <vector>
+
+using namespace mlir;
+using namespace mlir::qco;
+using namespace mlir::qco::decomposition;
+using namespace mlir::qco::native_synth;
+
+namespace mlir::qco::native_synth_test {
+
+using mqt::test::isEquivalentUpToGlobalPhase;
+
+/// Square complex matrix with a runtime-chosen dimension, stored densely in
+/// row-major order.
+///
+/// The multi-qubit equivalence checks need it because synthesized circuits may
+/// span more than two wires, which the fixed-size `Matrix2x2`/`Matrix4x4`
+/// cannot represent. The interface is limited to what
+/// `mqt::test::isEquivalentUpToGlobalPhase` requires: `adjoint()`,
+/// `operator*`, scalar multiply, `trace()`, and `isApprox()`.
+class TestMatrix {
+public:
+  /// Empty matrix of dimension zero.
+  TestMatrix() = default;
+  /// Zero-filled `dim × dim` matrix.
+  explicit TestMatrix(std::size_t dim)
+      : dim_(dim), data_(dim * dim, std::complex<double>{0.0, 0.0}) {}
+
+  /// Build the identity matrix of dimension @p dim.
+  [[nodiscard]] static TestMatrix identity(std::size_t dim);
+  /// Copy a fixed-size `2×2` matrix into a `TestMatrix`.
+  [[nodiscard]] static TestMatrix fromMatrix2x2(const Matrix2x2& matrix);
+  /// Copy a fixed-size `4×4` matrix into a `TestMatrix`.
+  [[nodiscard]] static TestMatrix fromMatrix4x4(const Matrix4x4& matrix);
+
+  /// Number of rows (equal to the number of columns).
+  [[nodiscard]] std::size_t dim() const { return dim_; }
+
+  /// Mutable access to the entry at (@p row, @p col); indices are unchecked.
+  [[nodiscard]] std::complex<double>& operator()(std::size_t row,
+                                                 std::size_t col) {
+    return data_[flatIndex(row, col)];
+  }
+  /// Read-only copy of the entry at (@p row, @p col); indices are unchecked.
+  [[nodiscard]] std::complex<double> operator()(std::size_t row,
+                                                std::size_t col) const {
+    return data_[flatIndex(row, col)];
+  }
+
+  /// Matrix product; both operands must have the same dimension.
+  [[nodiscard]] TestMatrix operator*(const TestMatrix& rhs) const;
+  /// Multiply every entry by the complex @p scalar.
+  [[nodiscard]] TestMatrix operator*(std::complex<double> scalar) const;
+  /// Conjugate transpose.
+  [[nodiscard]] TestMatrix adjoint() const;
+  /// Sum of the diagonal entries.
+  [[nodiscard]] std::complex<double> trace() const;
+  /// Entry-wise approximate equality; false when the dimensions differ.
+  [[nodiscard]] bool isApprox(const TestMatrix& other,
+                              double tol = 1e-10) const;
+
+private:
+  /// Position of (@p row, @p col) inside the row-major storage.
+  [[nodiscard]] std::size_t flatIndex(std::size_t row, std::size_t col) const {
+    return (row * dim_) + col;
+  }
+
+  std::size_t dim_ = 0;
+  std::vector<std::complex<double>> data_;
+};
+
+/// Scalar-times-matrix overload; yields the same result as the member
+/// `matrix * scalar`.
+[[nodiscard]] inline TestMatrix operator*(std::complex<double> scalar,
+                                          const TestMatrix& matrix) {
+  TestMatrix scaled = matrix * scalar;
+  return scaled;
+}
+
+// Forward declarations of the matrix-extraction and circuit-evaluation helpers
+// defined further down in this file.
+
+/// Write the 2×2 unitary of @p op into @p out; returns false if none is
+/// available.
+bool extractSingleQubitMatrix(qco::UnitaryOpInterface op, Matrix2x2& out);
+/// Write the 4×4 unitary of @p op into @p out; returns false if none is
+/// available.
+bool extractTwoQubitMatrix(qco::UnitaryOpInterface op, Matrix4x4& out);
+/// Accumulate the 4×4 unitary of a module that allocates exactly two qubits;
+/// `std::nullopt` if the module cannot be evaluated.
+[[nodiscard]] std::optional<Matrix4x4>
+computeTwoQubitUnitaryFromModule(const OwningOpRef<ModuleOp>& moduleOp);
+/// Embed a single-qubit @p matrix acting on wire @p q into a `2^numQubits`
+/// dimensional matrix.
+[[nodiscard]] TestMatrix expandOneQToN(const Matrix2x2& matrix, std::size_t q,
+                                       std::size_t numQubits);
+/// Embed a two-qubit @p matrix acting on wires @p q0, @p q1 into a
+/// `2^numQubits` dimensional matrix.
+[[nodiscard]] TestMatrix expandTwoQToN(const Matrix4x4& matrix, std::size_t q0,
+                                       std::size_t q1, std::size_t numQubits);
+/// Accumulate the full unitary of a module with at most @p maxQubits qubits;
+/// `std::nullopt` if the module cannot be evaluated.
+[[nodiscard]] std::optional<TestMatrix>
+computeNQubitUnitaryFromModule(const OwningOpRef<ModuleOp>& moduleOp,
+                               std::size_t maxQubits = 6);
+
+/// One row of the standard multi-profile equivalence sweeps in tests: a
+/// native-gate menu string together with the predicate used to check that a
+/// module conforms to that menu.
+struct NativeSynthesisProfileSweepCase {
+  /// Pointer to a predicate over a module.
+  using NativePredicate = bool (*)(mlir::OwningOpRef<mlir::ModuleOp>&);
+
+  /// Comma-separated native gate menu passed to the synthesis pass.
+  const char* nativeGates;
+  /// Predicate applied to the synthesized module.
+  NativePredicate isNative;
+};
+
+/// Shared gtest fixture for native-gate synthesis pass tests.
+///
+/// `SetUp` creates an `MLIRContext` with the QC, QCO, arith, func and memref
+/// dialects loaded. The static helpers classify synthesized modules against a
+/// gate menu and run the QC-to-QCO conversion and native-synthesis pipelines.
+class NativeSynthesisPassTest : public testing::Test {
+protected:
+  /// Build a fresh context and load every dialect registered here.
+  void SetUp() override {
+    mlir::DialectRegistry registry;
+    registry.insert<mlir::qc::QCDialect, mlir::qco::QCODialect,
+                    mlir::arith::ArithDialect, mlir::func::FuncDialect,
+                    mlir::memref::MemRefDialect>();
+    context = std::make_unique<mlir::MLIRContext>();
+    context->appendDialectRegistry(registry);
+    context->loadAllAvailableDialects();
+  }
+
+  /// Check that @p moduleOp contains only menu-conforming unitary ops.
+  ///
+  /// Every `UnitaryOpInterface` op reached by the walk is accepted when it is
+  /// - directly nested in a `CtrlOp` (the enclosing `CtrlOp` is judged
+  ///   instead),
+  /// - a `BarrierOp` or `GPhaseOp`,
+  /// - a `CtrlOp` with exactly one control and one target whose body unitary
+  ///   is an `XOp` (if @p allowCx) or a `ZOp` (if @p allowCz), or
+  /// - an instance of one of the `Allowed1QOps` types.
+  ///
+  /// @return false as soon as any other unitary op is encountered.
+  template <typename... Allowed1QOps>
+  static bool onlyTheseOps(mlir::OwningOpRef<mlir::ModuleOp>& moduleOp,
+                           const bool allowCx, const bool allowCz) {
+    bool ok = true;
+    std::ignore = moduleOp->walk([&](mlir::qco::UnitaryOpInterface op) {
+      mlir::Operation* raw = op.getOperation();
+      // Body ops of a controlled gate are covered by the CtrlOp check below.
+      if (llvm::isa_and_present<mlir::qco::CtrlOp>(raw->getParentOp())) {
+        return mlir::WalkResult::advance();
+      }
+      if (llvm::isa<mlir::qco::BarrierOp, mlir::qco::GPhaseOp>(raw)) {
+        return mlir::WalkResult::advance();
+      }
+      if (auto ctrl = llvm::dyn_cast<mlir::qco::CtrlOp>(raw)) {
+        if (ctrl.getNumControls() != 1 || ctrl.getNumTargets() != 1) {
+          ok = false;
+          return mlir::WalkResult::interrupt();
+        }
+        mlir::Operation* body = ctrl.getBodyUnitary(0).getOperation();
+        const bool isCx = llvm::isa<mlir::qco::XOp>(body);
+        const bool isCz = llvm::isa<mlir::qco::ZOp>(body);
+        if ((isCx && allowCx) || (isCz && allowCz)) {
+          return mlir::WalkResult::advance();
+        }
+        ok = false;
+        return mlir::WalkResult::interrupt();
+      }
+
+      if (!llvm::isa<Allowed1QOps...>(raw)) {
+        ok = false;
+        return mlir::WalkResult::interrupt();
+      }
+      return mlir::WalkResult::advance();
+    });
+    return ok;
+  }
+
+  /// Menu predicate: X, SX, RZ, P plus CX.
+  static bool onlyIbmBasicCxOps(mlir::OwningOpRef<mlir::ModuleOp>& moduleOp) {
+    return onlyTheseOps<mlir::qco::XOp, mlir::qco::SXOp, mlir::qco::RZOp,
+                        mlir::qco::POp>(moduleOp, /*allowCx=*/true,
+                                        /*allowCz=*/false);
+  }
+
+  /// Menu predicate: X, SX, RZ, P plus CZ.
+  static bool onlyIbmBasicCzOps(mlir::OwningOpRef<mlir::ModuleOp>& moduleOp) {
+    return onlyTheseOps<mlir::qco::XOp, mlir::qco::SXOp, mlir::qco::RZOp,
+                        mlir::qco::POp>(moduleOp, /*allowCx=*/false,
+                                        /*allowCz=*/true);
+  }
+
+  /// Menu predicate: U plus CX.
+  static bool onlyGenericU3CxOps(mlir::OwningOpRef<mlir::ModuleOp>& moduleOp) {
+    return onlyTheseOps<mlir::qco::UOp>(moduleOp, /*allowCx=*/true,
+                                        /*allowCz=*/false);
+  }
+
+  /// Menu predicate: U plus CZ.
+  static bool onlyGenericU3CzOps(mlir::OwningOpRef<mlir::ModuleOp>& moduleOp) {
+    return onlyTheseOps<mlir::qco::UOp>(moduleOp, /*allowCx=*/false,
+                                        /*allowCz=*/true);
+  }
+
+  /// Menu predicate: R plus CZ.
+  static bool onlyIqmDefaultOps(mlir::OwningOpRef<mlir::ModuleOp>& moduleOp) {
+    return onlyTheseOps<mlir::qco::ROp>(moduleOp, /*allowCx=*/false,
+                                        /*allowCz=*/true);
+  }
+
+  /// Menu predicate: X, SX, RZ, P, RX, RZZ plus CZ.
+  static bool
+  onlyIbmFractionalOps(mlir::OwningOpRef<mlir::ModuleOp>& moduleOp) {
+    return onlyTheseOps<mlir::qco::XOp, mlir::qco::SXOp, mlir::qco::RZOp,
+                        mlir::qco::POp, mlir::qco::RXOp, mlir::qco::RZZOp>(
+        moduleOp, /*allowCx=*/false, /*allowCz=*/true);
+  }
+
+  /// Menu predicate: RX, RZ, P plus CX.
+  static bool
+  onlyAxisPairRxRzCxOps(mlir::OwningOpRef<mlir::ModuleOp>& moduleOp) {
+    return onlyTheseOps<mlir::qco::RXOp, mlir::qco::RZOp, mlir::qco::POp>(
+        moduleOp, /*allowCx=*/true, /*allowCz=*/false);
+  }
+
+  /// Menu predicate: RX, RY plus CX.
+  static bool
+  onlyAxisPairRxRyCxOps(mlir::OwningOpRef<mlir::ModuleOp>& moduleOp) {
+    return onlyTheseOps<mlir::qco::RXOp, mlir::qco::RYOp>(
+        moduleOp, /*allowCx=*/true, /*allowCz=*/false);
+  }
+
+  /// Menu predicate: RY, RZ, P plus CZ.
+  static bool
+  onlyAxisPairRyRzCzOps(mlir::OwningOpRef<mlir::ModuleOp>& moduleOp) {
+    return onlyTheseOps<mlir::qco::RYOp, mlir::qco::RZOp, mlir::qco::POp>(
+        moduleOp, /*allowCx=*/false, /*allowCz=*/true);
+  }
+
+  /// Menu predicate: U, RX, RZ, P plus CX.
+  static bool
+  onlyUOrAxisPairRxRzCxOps(mlir::OwningOpRef<mlir::ModuleOp>& moduleOp) {
+    return onlyTheseOps<mlir::qco::UOp, mlir::qco::RXOp, mlir::qco::RZOp,
+                        mlir::qco::POp>(moduleOp, /*allowCx=*/true,
+                                        /*allowCz=*/false);
+  }
+
+  /// Menu predicate: U plus CX and CZ.
+  static bool
+  onlyGenericU3CxOrCzOps(mlir::OwningOpRef<mlir::ModuleOp>& moduleOp) {
+    return onlyTheseOps<mlir::qco::UOp>(moduleOp, /*allowCx=*/true,
+                                        /*allowCz=*/true);
+  }
+
+  /// The three (menu, predicate) pairs used by the core equivalence sweeps.
+  static std::array<NativeSynthesisProfileSweepCase, 3>
+  coreEquivalenceProfiles() {
+    return {{{.nativeGates = "x,sx,rz,cx", .isNative = &onlyIbmBasicCxOps},
+             {.nativeGates = "u,cx", .isNative = &onlyGenericU3CxOps},
+             {.nativeGates = "r,cz", .isNative = &onlyIqmDefaultOps}}};
+  }
+
+  /// Run QC-to-QCO conversion followed by native-gate synthesis for
+  /// @p nativeGates on @p moduleOp.
+  ///
+  /// A pipeline failure is reported with `ASSERT_TRUE`, which only returns
+  /// from this helper. Callers that must stop as well should wrap the call in
+  /// `ASSERT_NO_FATAL_FAILURE`.
+  static void runNativeSynthesis(mlir::OwningOpRef<mlir::ModuleOp>& moduleOp,
+                                 const std::string& nativeGates) {
+    mlir::PassManager pm(moduleOp->getContext());
+    pm.addPass(mlir::createQCToQCO());
+    pm.addPass(mlir::qco::createNativeGateSynthesisPass(
+        mlir::qco::NativeGateSynthesisOptions{
+            .nativeGates = nativeGates,
+        }));
+    ASSERT_TRUE(mlir::succeeded(pm.run(*moduleOp)));
+  }
+
+  /// Run only the QC-to-QCO conversion on @p moduleOp.
+  ///
+  /// Failure reporting follows the same convention as `runNativeSynthesis`.
+  static void runQcToQco(mlir::OwningOpRef<mlir::ModuleOp>& moduleOp) {
+    mlir::PassManager pm(moduleOp->getContext());
+    pm.addPass(mlir::createQCToQCO());
+    ASSERT_TRUE(mlir::succeeded(pm.run(*moduleOp)));
+  }
+
+  /// Print @p moduleOp into a string.
+  static std::string
+  moduleToString(const mlir::OwningOpRef<mlir::ModuleOp>& moduleOp) {
+    std::string text;
+    llvm::raw_string_ostream os(text);
+    moduleOp.get()->print(os);
+    return text;
+  }
+
+  /// Build a module with @p buildFn, synthesize it for @p nativeGates, and
+  /// expect @p isNative to accept the result.
+  template <typename BuildFn, typename PredicateFn>
+  void expectNativeAfterSynthesis(BuildFn buildFn,
+                                  const std::string& nativeGates,
+                                  PredicateFn isNative) {
+    auto moduleOp = buildFn();
+    // Stop here if the pipeline failed; the predicate would otherwise be
+    // evaluated on a module the passes did not finish processing.
+    ASSERT_NO_FATAL_FAILURE(runNativeSynthesis(moduleOp, nativeGates));
+    EXPECT_TRUE(isNative(moduleOp));
+  }
+
+  /// Build a module with @p buildFn and expect the conversion + synthesis
+  /// pipeline for @p nativeGates to fail on it.
+  template <typename BuildFn>
+  void expectSynthesisFailure(BuildFn buildFn, const std::string& nativeGates) {
+    auto moduleOp = buildFn();
+    mlir::PassManager pm(moduleOp->getContext());
+    pm.addPass(mlir::createQCToQCO());
+    pm.addPass(mlir::qco::createNativeGateSynthesisPass(
+        mlir::qco::NativeGateSynthesisOptions{
+            .nativeGates = nativeGates,
+        }));
+    EXPECT_TRUE(mlir::failed(pm.run(*moduleOp)));
+  }
+
+  /// Check that synthesis for @p nativeGates yields a module accepted by
+  /// @p isNative whose unitary (as computed by @p computeUnitary) equals that
+  /// of the merely converted module up to a global phase.
+  template <typename BuildFn, typename PredicateFn, typename UnitaryFn>
+  void expectEquivalentAndNativeAfterSynthesis(BuildFn buildFn,
+                                               const std::string& nativeGates,
+                                               PredicateFn isNative,
+                                               UnitaryFn computeUnitary) {
+    auto expectedModule = buildFn();
+    // Propagate fatal pipeline failures instead of comparing unitaries of
+    // modules the passes did not finish processing.
+    ASSERT_NO_FATAL_FAILURE(runQcToQco(expectedModule));
+    const auto expectedUnitary = computeUnitary(expectedModule);
+    ASSERT_TRUE(expectedUnitary.has_value());
+
+    auto synthesizedModule = buildFn();
+    ASSERT_NO_FATAL_FAILURE(runNativeSynthesis(synthesizedModule, nativeGates));
+    EXPECT_TRUE(isNative(synthesizedModule));
+    const auto synthesizedUnitary = computeUnitary(synthesizedModule);
+    ASSERT_TRUE(synthesizedUnitary.has_value());
+    EXPECT_TRUE(
+        isEquivalentUpToGlobalPhase(*expectedUnitary, *synthesizedUnitary));
+  }
+
+  /// Context owning all IR created by the tests; recreated in `SetUp`.
+  std::unique_ptr<mlir::MLIRContext> context;
+};
+
+/// Identity of dimension @p dim: ones on the diagonal, zeros elsewhere.
+TestMatrix TestMatrix::identity(std::size_t dim) {
+  TestMatrix eye(dim);
+  std::size_t diag = 0;
+  while (diag < dim) {
+    eye(diag, diag) = std::complex<double>{1.0, 0.0};
+    ++diag;
+  }
+  return eye;
+}
+
+/// Copy all four entries of @p matrix into a new dimension-2 `TestMatrix`.
+TestMatrix TestMatrix::fromMatrix2x2(const Matrix2x2& matrix) {
+  constexpr std::size_t side = 2;
+  TestMatrix promoted(side);
+  // Visit the entries in row-major order using a single flattened counter.
+  for (std::size_t flat = 0; flat < side * side; ++flat) {
+    const std::size_t row = flat / side;
+    const std::size_t col = flat % side;
+    promoted(row, col) = matrix(row, col);
+  }
+  return promoted;
+}
+
+/// Copy all sixteen entries of @p matrix into a new dimension-4 `TestMatrix`.
+TestMatrix TestMatrix::fromMatrix4x4(const Matrix4x4& matrix) {
+  constexpr std::size_t side = 4;
+  TestMatrix promoted(side);
+  // Visit the entries in row-major order using a single flattened counter.
+  for (std::size_t flat = 0; flat < side * side; ++flat) {
+    const std::size_t row = flat / side;
+    const std::size_t col = flat % side;
+    promoted(row, col) = matrix(row, col);
+  }
+  return promoted;
+}
+
+/// Matrix product `(*this) * rhs`, accumulated row by row.
+///
+/// The result has this matrix's dimension; @p rhs is indexed with the same
+/// bounds and is not checked against them.
+TestMatrix TestMatrix::operator*(const TestMatrix& rhs) const {
+  const std::complex<double> zero{0.0, 0.0};
+  TestMatrix product(dim_);
+  for (std::size_t i = 0; i < dim_; ++i) {
+    for (std::size_t inner = 0; inner < dim_; ++inner) {
+      const std::complex<double> lhsEntry = (*this)(i, inner);
+      // Exactly-zero left entries contribute nothing, so their row of `rhs`
+      // is skipped.
+      if (lhsEntry != zero) {
+        for (std::size_t j = 0; j < dim_; ++j) {
+          product(i, j) += lhsEntry * rhs(inner, j);
+        }
+      }
+    }
+  }
+  return product;
+}
+
+/// Copy of this matrix with every entry multiplied by @p scalar.
+TestMatrix TestMatrix::operator*(std::complex<double> scalar) const {
+  TestMatrix scaled(dim_);
+  // Both matrices share the same row-major layout, so scale entry by entry in
+  // storage order.
+  for (std::size_t flat = 0; flat < data_.size(); ++flat) {
+    scaled.data_[flat] = data_[flat] * scalar;
+  }
+  return scaled;
+}
+
+/// Conjugate transpose: entry (c, r) of the result is the complex conjugate
+/// of entry (r, c) of this matrix.
+TestMatrix TestMatrix::adjoint() const {
+  TestMatrix dagger(dim_);
+  for (std::size_t outRow = 0; outRow < dim_; ++outRow) {
+    for (std::size_t outCol = 0; outCol < dim_; ++outCol) {
+      dagger(outRow, outCol) = std::conj((*this)(outCol, outRow));
+    }
+  }
+  return dagger;
+}
+
+/// Sum of the diagonal entries, accumulated from the top-left down.
+std::complex<double> TestMatrix::trace() const {
+  std::complex<double> total{0.0, 0.0};
+  std::size_t diag = 0;
+  while (diag < dim_) {
+    total += (*this)(diag, diag);
+    ++diag;
+  }
+  return total;
+}
+
+/// Entry-wise approximate equality.
+///
+/// @param other Matrix to compare against.
+/// @param tol   Maximum allowed absolute difference per entry.
+/// @return true iff both matrices have the same dimension and every pair of
+///         corresponding entries differs by at most @p tol in magnitude.
+///
+/// The comparison is phrased as "accept when `<= tol`" rather than "reject
+/// when `> tol`": every ordered comparison with NaN is false, so the rejecting
+/// form would report matrices containing NaN entries as approximately equal.
+bool TestMatrix::isApprox(const TestMatrix& other, double tol) const {
+  if (dim_ != other.dim_) {
+    return false;
+  }
+  const auto withinTolerance = [tol](const std::complex<double>& lhs,
+                                     const std::complex<double>& rhs) {
+    return std::abs(lhs - rhs) <= tol;
+  };
+  // Equal dimensions imply equally sized storage, so the ranges line up.
+  return std::equal(data_.begin(), data_.end(), other.data_.begin(),
+                    withinTolerance);
+}
+
+/// Operand number @p index of @p op if that index is below the op's qubit
+/// count and the operand has `QubitType`; `std::nullopt` otherwise.
+[[nodiscard]] static std::optional<Value>
+getUnitaryQubitOperand(qco::UnitaryOpInterface op, std::size_t index) {
+  if (index < op.getNumQubits()) {
+    if (Value operand = op->getOperand(index);
+        llvm::isa<qco::QubitType>(operand.getType())) {
+      return operand;
+    }
+  }
+  return std::nullopt;
+}
+
+/// Result number @p index of @p op if that index is below the op's qubit
+/// count and the result has `QubitType`; `std::nullopt` otherwise.
+[[nodiscard]] static std::optional<Value>
+getUnitaryQubitResult(qco::UnitaryOpInterface op, std::size_t index) {
+  if (index < op.getNumQubits()) {
+    if (Value result = op->getResult(index);
+        llvm::isa<qco::QubitType>(result.getType())) {
+      return result;
+    }
+  }
+  return std::nullopt;
+}
+
+/// Obtain the 2x2 unitary of a single-qubit op.
+///
+/// Tries the fixed-size query first and falls back to the dynamic matrix,
+/// which is accepted only if it is exactly 2x2.
+/// @return true if @p out was filled from either source.
+bool extractSingleQubitMatrix(qco::UnitaryOpInterface op, Matrix2x2& out) {
+  if (op.getUnitaryMatrix2x2(out)) {
+    return true;
+  }
+  qco::DynamicMatrix fallback;
+  const bool isTwoByTwo = op.getUnitaryMatrixDynamic(fallback) &&
+                          fallback.rows() == 2 && fallback.cols() == 2;
+  if (isTwoByTwo) {
+    out = Matrix2x2::fromElements(fallback(0, 0), fallback(0, 1),
+                                  fallback(1, 0), fallback(1, 1));
+  }
+  return isTwoByTwo;
+}
+
+/// Obtain the 4×4 unitary of a two-qubit op (same layout as
+/// ``getUnitaryMatrix4x4``).
+///
+/// `native_synth::getBlockTwoQubitMatrix` is consulted first; the op's own
+/// `getUnitaryMatrix4x4` is queried only if that fails.
+bool extractTwoQubitMatrix(qco::UnitaryOpInterface op, Matrix4x4& out) {
+  const bool fromBlock =
+      native_synth::getBlockTwoQubitMatrix(op.getOperation(), out);
+  return fromBlock ? true : op.getUnitaryMatrix4x4(out);
+}
+
+/// Accumulate the 4×4 unitary implemented by a module with exactly two
+/// allocated qubits.
+///
+/// Qubits are numbered in the order their `AllocOp`s appear. The ops directly
+/// contained in the blocks of every `func::FuncOp` are then visited in order,
+/// and each single- or two-qubit unitary is left-multiplied onto the running
+/// product. `BarrierOp` and `GPhaseOp` are skipped.
+///
+/// @return `std::nullopt` if the module is null, does not allocate exactly two
+///         qubits, uses a qubit value that cannot be traced back to an
+///         allocation, or contains a gate whose matrix cannot be extracted.
+std::optional<Matrix4x4>
+computeTwoQubitUnitaryFromModule(const OwningOpRef<ModuleOp>& moduleOp) {
+  ModuleOp module = moduleOp.get();
+  if (!module) {
+    return std::nullopt;
+  }
+  Matrix4x4 unitary = Matrix4x4::identity();
+  // Maps every SSA qubit value seen so far to the wire (0 or 1) it lives on.
+  llvm::DenseMap<Value, std::size_t> qubitIds;
+  std::size_t nextQubitId = 0;
+
+  // First pass: assign wire indices to the allocations.
+  for (auto func : module.getOps<func::FuncOp>()) {
+    for (auto& block : func.getBlocks()) {
+      for (auto& rawOp : block.getOperations()) {
+        if (auto alloc = llvm::dyn_cast<qco::AllocOp>(&rawOp)) {
+          // A third allocation means this is not a two-qubit circuit.
+          if (nextQubitId >= 2) {
+            return std::nullopt;
+          }
+          qubitIds.try_emplace(alloc.getResult(), nextQubitId++);
+        }
+      }
+    }
+  }
+
+  auto getQubitId = [&](Value qubit) -> std::optional<std::size_t> {
+    auto it = qubitIds.find(qubit);
+    if (it == qubitIds.end()) {
+      return std::nullopt;
+    }
+    return it->second;
+  };
+
+  // Second pass: apply the gates in program order.
+  for (auto func : module.getOps<func::FuncOp>()) {
+    for (auto& block : func.getBlocks()) {
+      for (auto& rawOp : block.getOperations()) {
+        auto op = llvm::dyn_cast<qco::UnitaryOpInterface>(&rawOp);
+        if (!op) {
+          continue;
+        }
+        if (llvm::isa<qco::BarrierOp, qco::GPhaseOp>(op.getOperation())) {
+          continue;
+        }
+
+        if (op.isSingleQubit()) {
+          const auto qIn = getUnitaryQubitOperand(op, 0);
+          if (!qIn) {
+            return std::nullopt;
+          }
+          auto qid = getQubitId(*qIn);
+          if (!qid) {
+            return std::nullopt;
+          }
+          Matrix2x2 oneQ;
+          if (!extractSingleQubitMatrix(op, oneQ)) {
+            return std::nullopt;
+          }
+          // Later gates act after earlier ones, hence the left multiplication.
+          unitary = decomposition::expandToTwoQubits(
+                        oneQ, static_cast<decomposition::QubitId>(*qid)) *
+                    unitary;
+          const auto qOut = getUnitaryQubitResult(op, 0);
+          if (!qOut) {
+            return std::nullopt;
+          }
+          // The gate's result value continues on the same wire as its input.
+          qubitIds[*qOut] = *qid;
+          continue;
+        }
+
+        if (op.isTwoQubit()) {
+          const auto q0In = getUnitaryQubitOperand(op, 0);
+          const auto q1In = getUnitaryQubitOperand(op, 1);
+          if (!q0In || !q1In) {
+            return std::nullopt;
+          }
+          auto q0id = getQubitId(*q0In);
+          auto q1id = getQubitId(*q1In);
+          if (!q0id || !q1id) {
+            return std::nullopt;
+          }
+          Matrix4x4 twoQ;
+          if (!extractTwoQubitMatrix(op, twoQ)) {
+            return std::nullopt;
+          }
+          // Reorder the gate's (operand0, operand1) layout into the canonical
+          // (qubit 0, qubit 1) order used by `unitary`.
+          const llvm::SmallVector<decomposition::QubitId, 2> ids{
+              static_cast<decomposition::QubitId>(*q0id),
+              static_cast<decomposition::QubitId>(*q1id)};
+          unitary =
+              decomposition::fixTwoQubitMatrixQubitOrder(twoQ, ids) * unitary;
+          const auto q0Out = getUnitaryQubitResult(op, 0);
+          const auto q1Out = getUnitaryQubitResult(op, 1);
+          if (!q0Out || !q1Out) {
+            return std::nullopt;
+          }
+          // Results stay on the wires of the corresponding operands.
+          qubitIds[*q0Out] = *q0id;
+          qubitIds[*q1Out] = *q1id;
+          continue;
+        }
+      }
+    }
+  }
+
+  // Fewer than two allocations: not a two-qubit circuit either.
+  if (nextQubitId != 2) {
+    return std::nullopt;
+  }
+  return unitary;
+}
+
+/// Kronecker-embed ``matrix`` on wire ``q`` into a ``2^N``-dim unitary (same
+/// index bit order as QCO 4×4 matrices: wire 0 is the high bit).
+///
+/// For every basis column the bits of all other wires are kept, and the bit
+/// of wire ``q`` is mapped through ``matrix``.
+TestMatrix expandOneQToN(const Matrix2x2& matrix, std::size_t q,
+                         std::size_t numQubits) {
+  const std::size_t fullDim = 1ULL << numQubits;
+  const auto shift = numQubits - 1 - q;
+  const std::size_t wireMask = 1ULL << shift;
+  TestMatrix embedded(fullDim);
+  for (std::size_t column = 0; column < fullDim; ++column) {
+    const std::size_t inBit = (column >> shift) & 1ULL;
+    // Bits of the spectator wires, with the target wire's bit cleared.
+    const std::size_t spectators = column & ~wireMask;
+    for (std::size_t outBit = 0; outBit < 2; ++outBit) {
+      embedded(spectators | (outBit << shift), column) = matrix(outBit, inBit);
+    }
+  }
+  return embedded;
+}
+
+/// Embed ``matrix`` on wires ``q0``, ``q1`` into a ``2^N``-dim unitary.
+///
+/// The two-bit index into ``matrix`` uses the bit of ``q0`` as the high bit
+/// and the bit of ``q1`` as the low bit; all other wires pass through
+/// unchanged.
+TestMatrix expandTwoQToN(const Matrix4x4& matrix, std::size_t q0,
+                         std::size_t q1, std::size_t numQubits) {
+  const std::size_t fullDim = 1ULL << numQubits;
+  const auto shift0 = numQubits - 1 - q0;
+  const auto shift1 = numQubits - 1 - q1;
+  const std::size_t pairMask = (1ULL << shift0) | (1ULL << shift1);
+  TestMatrix embedded(fullDim);
+  for (std::size_t column = 0; column < fullDim; ++column) {
+    const std::size_t in0 = (column >> shift0) & 1ULL;
+    const std::size_t in1 = (column >> shift1) & 1ULL;
+    // 2-bit index for the pair matches QCO 4×4 row/column layout.
+    const std::size_t pairIn = (in0 << 1) | in1;
+    const std::size_t spectators = column & ~pairMask;
+    for (std::size_t pairOut = 0; pairOut < 4; ++pairOut) {
+      const std::size_t out0 = (pairOut >> 1) & 1ULL;
+      const std::size_t out1 = pairOut & 1ULL;
+      const std::size_t targetRow =
+          spectators | (out0 << shift0) | (out1 << shift1);
+      embedded(targetRow, column) = matrix(pairOut, pairIn);
+    }
+  }
+  return embedded;
+}
+
+/// Full ``2^N`` unitary from a QCO module (``alloc`` / ``static``, 1q/2q
+/// unitaries, ``ctrl`` with X/Z body).
+///
+/// Wire indices come from the order of the ``AllocOp``s, or from the index
+/// carried by each ``StaticOp``. The ops directly contained in the blocks of
+/// every ``func::FuncOp`` are then visited in order and each single- or
+/// two-qubit unitary is left-multiplied onto the running product.
+/// ``BarrierOp`` and ``GPhaseOp`` are skipped.
+///
+/// @param moduleOp  Module to evaluate.
+/// @param maxQubits Upper bound on ``N``; larger circuits are rejected.
+/// @return ``std::nullopt`` if the module is null, has no qubits, needs more
+///         than ``maxQubits`` wires, uses a qubit value that cannot be traced
+///         to an allocation, contains a gate whose matrix cannot be
+///         extracted, or contains a unitary op that is neither single- nor
+///         two-qubit.
+std::optional<TestMatrix>
+computeNQubitUnitaryFromModule(const OwningOpRef<ModuleOp>& moduleOp,
+                               std::size_t maxQubits) {
+  ModuleOp module = moduleOp.get();
+  if (!module) {
+    return std::nullopt;
+  }
+
+  // Maps every SSA qubit value seen so far to the wire it lives on.
+  llvm::DenseMap<Value, std::size_t> qubitIds;
+  std::size_t numQubits = 0;
+
+  // First pass: discover the wires.
+  for (auto func : module.getOps<func::FuncOp>()) {
+    for (auto& block : func.getBlocks()) {
+      for (auto& rawOp : block.getOperations()) {
+        if (auto alloc = llvm::dyn_cast<qco::AllocOp>(&rawOp)) {
+          if (numQubits >= maxQubits) {
+            return std::nullopt;
+          }
+          qubitIds.try_emplace(alloc.getResult(), numQubits++);
+        } else if (auto staticOp = llvm::dyn_cast<qco::StaticOp>(&rawOp)) {
+          const auto idx = static_cast<std::size_t>(staticOp.getIndex());
+          if (idx >= maxQubits) {
+            return std::nullopt;
+          }
+          qubitIds.try_emplace(staticOp.getResult(), idx);
+          numQubits = std::max(numQubits, idx + 1);
+        }
+      }
+    }
+  }
+
+  if (numQubits == 0) {
+    return std::nullopt;
+  }
+
+  TestMatrix unitary = TestMatrix::identity(1ULL << numQubits);
+
+  auto getQubitId = [&](Value qubit) -> std::optional<std::size_t> {
+    auto it = qubitIds.find(qubit);
+    if (it == qubitIds.end()) {
+      return std::nullopt;
+    }
+    return it->second;
+  };
+
+  // Second pass: apply the gates in program order.
+  for (auto func : module.getOps<func::FuncOp>()) {
+    for (auto& block : func.getBlocks()) {
+      for (auto& rawOp : block.getOperations()) {
+        auto op = llvm::dyn_cast<qco::UnitaryOpInterface>(&rawOp);
+        if (!op) {
+          continue;
+        }
+        if (llvm::isa<qco::BarrierOp, qco::GPhaseOp>(op.getOperation())) {
+          continue;
+        }
+
+        if (op.isSingleQubit()) {
+          const auto qIn = getUnitaryQubitOperand(op, 0);
+          if (!qIn) {
+            return std::nullopt;
+          }
+          auto qid = getQubitId(*qIn);
+          if (!qid) {
+            return std::nullopt;
+          }
+          Matrix2x2 oneQ;
+          if (!extractSingleQubitMatrix(op, oneQ)) {
+            return std::nullopt;
+          }
+          // Later gates act after earlier ones, hence the left multiplication.
+          unitary = expandOneQToN(oneQ, *qid, numQubits) * unitary;
+          const auto qOut = getUnitaryQubitResult(op, 0);
+          if (!qOut) {
+            return std::nullopt;
+          }
+          // The gate's result value continues on the same wire as its input.
+          qubitIds[*qOut] = *qid;
+          continue;
+        }
+
+        if (op.isTwoQubit()) {
+          const auto q0In = getUnitaryQubitOperand(op, 0);
+          const auto q1In = getUnitaryQubitOperand(op, 1);
+          if (!q0In || !q1In) {
+            return std::nullopt;
+          }
+          auto q0id = getQubitId(*q0In);
+          auto q1id = getQubitId(*q1In);
+          if (!q0id || !q1id) {
+            return std::nullopt;
+          }
+          Matrix4x4 twoQ;
+          if (!extractTwoQubitMatrix(op, twoQ)) {
+            return std::nullopt;
+          }
+          unitary = expandTwoQToN(twoQ, *q0id, *q1id, numQubits) * unitary;
+          const auto q0Out = getUnitaryQubitResult(op, 0);
+          const auto q1Out = getUnitaryQubitResult(op, 1);
+          if (!q0Out || !q1Out) {
+            return std::nullopt;
+          }
+          // Results stay on the wires of the corresponding operands.
+          qubitIds[*q0Out] = *q0id;
+          qubitIds[*q1Out] = *q1id;
+          continue;
+        }
+
+        // Any other unitary cannot be embedded by this helper. Report that
+        // instead of silently leaving the gate out of the product, which
+        // would let an equivalence check pass on an incomplete unitary.
+        return std::nullopt;
+      }
+    }
+  }
+
+  return unitary;
+}
+
+} // namespace mlir::qco::native_synth_test
+
+using namespace mlir::qco::native_synth_test;
+
+namespace {
+
+/// Parameter row for the menu-parameterized tests: a display name, the native
+/// gate menu string, and the predicate applied to the synthesized module.
+struct NativeSynthMenuRow {
+  /// Pointer to a predicate over a module.
+  using NativePredicate = bool (*)(OwningOpRef<ModuleOp>&);
+
+  const char* name;
+  const char* nativeGates;
+  NativePredicate isNative;
+};
+
+// --- Inline circuit builders ---
+
+/// Two-qubit circuit with a mix of single-qubit gates (X, H, RX, RZ on the
+/// first qubit; Y, SX, RY on the second) followed by CZ(first, second).
+static void broadOneQThenCz(mlir::qc::QCProgramBuilder& builder) {
+  const auto first = builder.allocQubit();
+  const auto second = builder.allocQubit();
+  builder.x(first);
+  builder.y(second);
+  builder.h(first);
+  builder.sx(second);
+  builder.rx(0.13, first);
+  builder.ry(-0.47, second);
+  builder.rz(0.29, first);
+  builder.cz(first, second);
+}
+
+/// Two-qubit circuit of zero-angle rotations (RX, RZ on the first qubit; RY,
+/// P on the second) followed by CZ(first, second).
+static void zeroAngleThenCz(mlir::qc::QCProgramBuilder& builder) {
+  const auto first = builder.allocQubit();
+  const auto second = builder.allocQubit();
+  builder.rx(0.0, first);
+  builder.ry(0.0, second);
+  builder.rz(0.0, first);
+  builder.p(0.0, second);
+  builder.cz(first, second);
+}
+
+/// Two-qubit circuit that mixes single-qubit gates (H, RX) with several
+/// two-qubit gate kinds: CX, CZ (operands reversed), SWAP, RZZ and RZX.
+static void ibmFractionalGateFamilies(mlir::qc::QCProgramBuilder& builder) {
+  const auto first = builder.allocQubit();
+  const auto second = builder.allocQubit();
+  builder.h(first);
+  builder.rx(0.13, second);
+  builder.cx(first, second);
+  builder.cz(second, first);
+  builder.swap(first, second);
+  builder.rzz(-0.33, first, second);
+  builder.rzx(0.41, first, second);
+}
+
+/// H, S, T, Y on the first qubit, then CX(first, second).
+static void hstycxTwoQ(mlir::qc::QCProgramBuilder& builder) {
+  const auto control = builder.allocQubit();
+  const auto target = builder.allocQubit();
+  builder.h(control);
+  builder.s(control);
+  builder.t(control);
+  builder.y(control);
+  builder.cx(control, target);
+}
+
+/// CX(first, second) followed by Y on the second qubit.
+static void cxYOnQ1(mlir::qc::QCProgramBuilder& builder) {
+  const auto control = builder.allocQubit();
+  const auto target = builder.allocQubit();
+  builder.cx(control, target);
+  builder.y(target);
+}
+
+/// H on the second qubit, CX(first, second), then T on the second qubit.
+static void hCxTOnQ1(mlir::qc::QCProgramBuilder& builder) {
+  const auto control = builder.allocQubit();
+  const auto target = builder.allocQubit();
+  builder.h(target);
+  builder.cx(control, target);
+  builder.t(target);
+}
+
+/// X, Y, SX on the first qubit, then CZ(first, second).
+static void xYSXCz(mlir::qc::QCProgramBuilder& builder) {
+  const auto first = builder.allocQubit();
+  const auto second = builder.allocQubit();
+  builder.x(first);
+  builder.y(first);
+  builder.sx(first);
+  builder.cz(first, second);
+}
+
+/// H and Y on the first qubit, then CX(first, second).
+static void hYCx(mlir::qc::QCProgramBuilder& builder) {
+  const auto control = builder.allocQubit();
+  const auto target = builder.allocQubit();
+  builder.h(control);
+  builder.y(control);
+  builder.cx(control, target);
+}
+
+/// Z on the first qubit, then CX(first, second).
+static void zCx(mlir::qc::QCProgramBuilder& builder) {
+  const auto control = builder.allocQubit();
+  const auto target = builder.allocQubit();
+  builder.z(control);
+  builder.cx(control, target);
+}
+
+/// X and H on the first qubit, then CZ(first, second).
+static void xHCz(mlir::qc::QCProgramBuilder& builder) {
+  const auto first = builder.allocQubit();
+  const auto second = builder.allocQubit();
+  builder.x(first);
+  builder.h(first);
+  builder.cz(first, second);
+}
+
+/// H on the first qubit, Y on the second, CX(first, second), then S on the
+/// first qubit.
+static void hq0Yq1CxSq0(mlir::qc::QCProgramBuilder& builder) {
+  const auto control = builder.allocQubit();
+  const auto target = builder.allocQubit();
+  builder.h(control);
+  builder.y(target);
+  builder.cx(control, target);
+  builder.s(control);
+}
+
+/// H on the first qubit, CX(first, second), then S on the second qubit.
+static void hCxSq1(mlir::qc::QCProgramBuilder& builder) {
+  const auto control = builder.allocQubit();
+  const auto target = builder.allocQubit();
+  builder.h(control);
+  builder.cx(control, target);
+  builder.s(target);
+}
+
+/// Three-qubit circuit: H on the first qubit followed by a CX chain
+/// (first→second, second→third).
+static void threeQGhz(mlir::qc::QCProgramBuilder& builder) {
+  const auto head = builder.allocQubit();
+  const auto middle = builder.allocQubit();
+  const auto tail = builder.allocQubit();
+  builder.h(head);
+  builder.cx(head, middle);
+  builder.cx(middle, tail);
+}
+
+/// A single SWAP on two qubits, both of which are explicitly deallocated
+/// afterwards.
+static void determinismSwap(mlir::qc::QCProgramBuilder& builder) {
+  const auto first = builder.allocQubit();
+  const auto second = builder.allocQubit();
+  builder.swap(first, second);
+  builder.dealloc(first);
+  builder.dealloc(second);
+}
+
+} // namespace
+
+// --- NativeSpec / NativePolicy ---
+
+// The `x,sx,rz,cx` menu resolves, contains the CX and X gate kinds, and does
+// not enable RZZ.
+TEST(NativeSpecTest, ResolveIbmBasicCx) {
+  const auto resolved = resolveNativeGatesSpec("x,sx,rz,cx");
+  ASSERT_TRUE(resolved);
+  const auto& gates = resolved->allowedGates;
+  EXPECT_TRUE(gates.contains(NativeGateKind::Cx));
+  EXPECT_TRUE(gates.contains(NativeGateKind::X));
+  EXPECT_FALSE(resolved->allowRzz);
+}
+
+// A menu containing an unrecognised token must not resolve.
+TEST(NativeSpecTest, ResolveRejectsUnknownToken) {
+  const auto resolved = resolveNativeGatesSpec("x,sx,rz,not-a-gate");
+  EXPECT_FALSE(resolved.has_value());
+}
+
+// Spelling the phase gate as `p` or as `rz` must yield the same allowed-gate
+// set.
+TEST(NativeSpecTest, PhaseAliasPMatchesRzInIbmStyleMenu) {
+  const auto withP = resolveNativeGatesSpec("x,sx,p,cx");
+  const auto withRz = resolveNativeGatesSpec("x,sx,rz,cx");
+  ASSERT_TRUE(withP);
+  ASSERT_TRUE(withRz);
+  const auto& gatesFromP = withP->allowedGates;
+  const auto& gatesFromRz = withRz->allowedGates;
+  EXPECT_EQ(gatesFromP, gatesFromRz);
+}
+
+// Axis-pair emitters map to Euler bases: RX/RZ to XZX and RY/RZ to ZYZ.
+TEST(NativeSpecTest, EmitterEulerBasisForAxisPair) {
+  const SingleQubitEmitterSpec rxRzEmitter{.mode = SingleQubitMode::AxisPair,
+                                           .axisPair = AxisPair::RxRz};
+  EXPECT_EQ(emitterEulerBasis(rxRzEmitter), EulerBasis::XZX);
+
+  const SingleQubitEmitterSpec ryRzEmitter{.mode = SingleQubitMode::AxisPair,
+                                           .axisPair = AxisPair::RyRz};
+  EXPECT_EQ(emitterEulerBasis(ryRzEmitter), EulerBasis::ZYZ);
+}
+
+// The entangler queries reflect exactly the entangler tokens present in the
+// menu.
+TEST(NativePolicyTest, UsesCxAndCzFromResolvedSpec) {
+  // Menu with CX only.
+  const auto cxMenu = resolveNativeGatesSpec("u,cx");
+  ASSERT_TRUE(cxMenu);
+  const auto& cxSpec = *cxMenu;
+  EXPECT_TRUE(usesCxEntangler(cxSpec));
+  EXPECT_FALSE(usesCzEntangler(cxSpec));
+
+  // Menu with both CX and CZ.
+  const auto cxCzMenu = resolveNativeGatesSpec("u,cx,cz");
+  ASSERT_TRUE(cxCzMenu);
+  const auto& cxCzSpec = *cxCzMenu;
+  EXPECT_TRUE(usesCxEntangler(cxCzSpec));
+  EXPECT_TRUE(usesCzEntangler(cxCzSpec));
+}
+
+/// Fixture providing a context and an initialized `QCOProgramBuilder` for the
+/// policy tests that need a real op to query.
+// NOLINTNEXTLINE(misc-use-internal-linkage)
+class NativePolicyAllowsOpTest : public ::testing::Test {
+protected:
+  // `context` is declared first because `builder` is constructed from its
+  // address.
+  MLIRContext context;
+  QCOProgramBuilder builder{&context};
+
+  /// Load the QCO, func, arith and scf dialects, then initialize the builder.
+  void SetUp() override {
+    context.loadDialect<QCODialect, func::FuncDialect, arith::ArithDialect,
+                        scf::SCFDialect>();
+    builder.initialize();
+  }
+};
+
+// An X gate is allowed by a menu that lists `x`.
+TEST_F(NativePolicyAllowsOpTest, AllowsSingleQubitOpRespectsMenu) {
+  const auto menu = resolveNativeGatesSpec("x,sx,rz,cx");
+  ASSERT_TRUE(menu);
+
+  // Build a one-gate program: X on static qubit 0.
+  Value wire = builder.staticQubit(0);
+  wire = builder.x(wire);
+  auto moduleOp = builder.finalize();
+  ASSERT_TRUE(moduleOp);
+
+  // Grab the first XOp in the module.
+  XOp firstX;
+  moduleOp->walk([&firstX](XOp candidate) {
+    firstX = candidate;
+    return WalkResult::interrupt();
+  });
+  ASSERT_TRUE(firstX);
+
+  auto asUnitary = llvm::cast<UnitaryOpInterface>(firstX.getOperation());
+  EXPECT_TRUE(allowsSingleQubitOp(asUnitary, *menu));
+}
+
+// An X gate is rejected by a menu that lists only `u` and `cx`.
+TEST_F(NativePolicyAllowsOpTest, RejectsSingleQubitOpNotInMenu) {
+  const auto menu = resolveNativeGatesSpec("u,cx");
+  ASSERT_TRUE(menu);
+
+  // Build a one-gate program: X on static qubit 0.
+  Value wire = builder.staticQubit(0);
+  wire = builder.x(wire);
+  auto moduleOp = builder.finalize();
+  ASSERT_TRUE(moduleOp);
+
+  // Grab the first XOp in the module.
+  XOp firstX;
+  moduleOp->walk([&firstX](XOp candidate) {
+    firstX = candidate;
+    return WalkResult::interrupt();
+  });
+  ASSERT_TRUE(firstX);
+
+  auto asUnitary = llvm::cast<UnitaryOpInterface>(firstX.getOperation());
+  EXPECT_FALSE(allowsSingleQubitOp(asUnitary, *menu));
+}
+
+// An RX gate built into a circuit is reported as directly decomposable to U3.
+TEST_F(NativePolicyAllowsOpTest, CanDirectlyDecomposeToU3OnRxInCircuit) {
+  // Build a one-gate program: RX(0.1) on static qubit 0.
+  Value wire = builder.staticQubit(0);
+  wire = builder.rx(0.1, wire);
+  auto moduleOp = builder.finalize();
+  ASSERT_TRUE(moduleOp);
+
+  // Grab the first RXOp in the module.
+  RXOp firstRx;
+  moduleOp->walk([&firstRx](RXOp candidate) {
+    firstRx = candidate;
+    return WalkResult::interrupt();
+  });
+  ASSERT_TRUE(firstRx);
+
+  mlir::Operation* const rawRx = firstRx.getOperation();
+  EXPECT_TRUE(canDirectlyDecomposeToU3(rawRx));
+}
+
+// --- Pass profile coverage ---
+
+// NOLINTNEXTLINE(misc-use-internal-linkage)
+class NativeSynthesisSwapProfileTest
+    : public NativeSynthesisPassTest,
+      public testing::WithParamInterface<NativeSynthMenuRow> {
+public: // re-exported so the suite instantiation can take their addresses
+  using NativeSynthesisPassTest::onlyGenericU3CxOps;
+  using NativeSynthesisPassTest::onlyIbmBasicCxOps;
+  using NativeSynthesisPassTest::onlyIqmDefaultOps;
+};
+
+TEST_P(NativeSynthesisSwapProfileTest, DecomposesSwapToProfile) {
+  // A bare SWAP program is lowered onto the menu of the current suite row.
+  const auto buildSwap = [this] {
+    return mlir::qc::QCProgramBuilder::build(context.get(), mlir::qc::swap);
+  };
+  const NativeSynthMenuRow& row = GetParam();
+  expectNativeAfterSynthesis(buildSwap, row.nativeGates, row.isNative);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    SwapMenuMatrix, NativeSynthesisSwapProfileTest,
+    testing::Values( // one row per menu: test name, gate list, predicate
+        NativeSynthMenuRow{"IbmBasicCx", "x,sx,rz,cx",
+                           &NativeSynthesisSwapProfileTest::onlyIbmBasicCxOps},
+        NativeSynthMenuRow{"GenericU3Cx", "u,cx",
+                           &NativeSynthesisSwapProfileTest::onlyGenericU3CxOps},
+        NativeSynthMenuRow{"IqmDefault", "r,cz",
+                           &NativeSynthesisSwapProfileTest::onlyIqmDefaultOps}),
+    [](const testing::TestParamInfo<NativeSynthMenuRow>& info) {
+      return info.param.name; // used by gtest as the per-row test suffix
+    });
+
+TEST_F(NativeSynthesisPassTest, DecomposesHstycxToIbmBasicCx) {
+  const auto buildProgram = [this] { // input for the {x, sx, rz, cx} menu
+    return mlir::qc::QCProgramBuilder::build(context.get(), hstycxTwoQ);
+  };
+  expectNativeAfterSynthesis(buildProgram, "x,sx,rz,cx",
+                             &NativeSynthesisPassTest::onlyIbmBasicCxOps);
+}
+
+TEST_F(NativeSynthesisPassTest, DecomposesCxYOnQ1ToIqmDefault) {
+  expectNativeAfterSynthesis( // target menu: IQM-style `r` plus `cz`
+      [&] { return mlir::qc::QCProgramBuilder::build(context.get(), cxYOnQ1); },
+      "r,cz", &NativeSynthesisPassTest::onlyIqmDefaultOps);
+}
+
+TEST_F(NativeSynthesisPassTest, BroadOneQCanonicalizationOnIqmDefault) {
+  auto program =
+      mlir::qc::QCProgramBuilder::build(context.get(), broadOneQThenCz);
+  runNativeSynthesis(program, "r,cz"); // the same module is inspected below
+  EXPECT_TRUE(onlyIqmDefaultOps(program));
+}
+
+TEST_F(NativeSynthesisPassTest, ZeroAngleCanonicalizationOnRyRzCz) {
+  auto program =
+      mlir::qc::QCProgramBuilder::build(context.get(), zeroAngleThenCz);
+  runNativeSynthesis(program, "ry,rz,cz"); // the same module is inspected below
+  EXPECT_TRUE(onlyAxisPairRyRzCzOps(program));
+}
+
+TEST_F(NativeSynthesisPassTest, DecomposesCxToCzForIbmBasicCzProfile) {
+  const auto buildProgram = [this] { // CZ is the only entangler in the menu
+    return mlir::qc::QCProgramBuilder::build(context.get(), hCxTOnQ1);
+  };
+  expectNativeAfterSynthesis(buildProgram, "x,sx,rz,cz",
+                             &NativeSynthesisPassTest::onlyIbmBasicCzOps);
+}
+
+TEST_F(NativeSynthesisPassTest, DecomposesToIqmDefaultProfile) {
+  expectNativeAfterSynthesis( // target menu: IQM-style `r` plus `cz`
+      [&] { return mlir::qc::QCProgramBuilder::build(context.get(), xYSXCz); },
+      "r,cz", &NativeSynthesisPassTest::onlyIqmDefaultOps);
+}
+
+TEST_F(NativeSynthesisPassTest, DecomposesToIbmFractionalProfile) {
+  const auto buildProgram = [this] { // input for the fractional CZ menu
+    return mlir::qc::QCProgramBuilder::build(context.get(),
+                                             ibmFractionalGateFamilies);
+  };
+  expectNativeAfterSynthesis(buildProgram, "x,sx,rz,rx,rzz,cz",
+                             &NativeSynthesisPassTest::onlyIbmFractionalOps);
+}
+
+TEST_F(NativeSynthesisPassTest, DecomposesToAxisPairRxRzCxProfile) {
+  expectNativeAfterSynthesis( // target menu: RX/RZ axis pair plus CX
+      [&] { return mlir::qc::QCProgramBuilder::build(context.get(), hYCx); },
+      "rx,rz,cx", &NativeSynthesisPassTest::onlyAxisPairRxRzCxOps);
+}
+
+TEST_F(NativeSynthesisPassTest, DecomposesRzToAxisPairRxRyCxProfile) {
+  expectNativeAfterSynthesis( // target menu: RX/RY axis pair plus CX (no RZ)
+      [&] { return mlir::qc::QCProgramBuilder::build(context.get(), zCx); },
+      "rx,ry,cx", &NativeSynthesisPassTest::onlyAxisPairRxRyCxOps);
+}
+
+TEST_F(NativeSynthesisPassTest, GenericProfileMatchesGenericU3CxBehavior) {
+  const auto buildProgram = [this] { // input for the generic {u, cx} menu
+    return mlir::qc::QCProgramBuilder::build(context.get(), hq0Yq1CxSq0);
+  };
+  expectEquivalentAndNativeAfterSynthesis(
+      buildProgram, "u,cx", &NativeSynthesisPassTest::onlyGenericU3CxOps,
+      computeTwoQubitUnitaryFromModule);
+}
+
+TEST_F(NativeSynthesisPassTest, GenericProfileMatchesAxisPairRyRzCzBehavior) {
+  expectEquivalentAndNativeAfterSynthesis( // menu: RY/RZ axis pair plus CZ
+      [&] { return mlir::qc::QCProgramBuilder::build(context.get(), xHCz); },
+      "ry,rz,cz", &NativeSynthesisPassTest::onlyAxisPairRyRzCzOps,
+      computeTwoQubitUnitaryFromModule);
+}
+
+TEST_F(NativeSynthesisPassTest, CustomProfileAcceptsMultipleEntanglersMenu) {
+  expectEquivalentAndNativeAfterSynthesis( // menu lists both cx and cz
+      [&] { return mlir::qc::QCProgramBuilder::build(context.get(), hCxSq1); },
+      "u,cx,cz", &NativeSynthesisPassTest::onlyGenericU3CxOrCzOps,
+      computeTwoQubitUnitaryFromModule);
+}
+
+TEST_F(NativeSynthesisPassTest, FailsForUnsupportedNativeGateMenu) {
+  // "not-a-gate" is not among the documented menu tokens; expect a failure.
+  const auto buildProgram = [this] {
+    return mlir::qc::QCProgramBuilder::build(context.get(), mlir::qc::h);
+  };
+  expectSynthesisFailure(buildProgram, "not-a-gate");
+}
+
+TEST_F(NativeSynthesisPassTest, FailsForNativeGateMenuWithoutSingleQEmitter) {
+  // The menu names only entanglers (cx, cz) and no single-qubit gate.
+  const auto buildProgram = [this] {
+    return mlir::qc::QCProgramBuilder::build(context.get(),
+                                             mlir::qc::singleControlledX);
+  };
+  expectSynthesisFailure(buildProgram, "cx,cz");
+}
+
+TEST_F(NativeSynthesisPassTest, FailsForMultiControlledGateStructure) {
+  // A multiply-controlled X is expected to be rejected for this menu.
+  const auto buildProgram = [this] {
+    return mlir::qc::QCProgramBuilder::build(context.get(),
+                                             mlir::qc::multipleControlledX);
+  };
+  expectSynthesisFailure(buildProgram, "x,sx,rz,cx");
+}
+
+TEST_F(NativeSynthesisPassTest, CandidateSelectionIsDeterministicAcrossRuns) {
+  const auto makeProgram = [this] { // fresh copy of the same input each call
+    return mlir::qc::QCProgramBuilder::build(context.get(), determinismSwap);
+  };
+  auto runA = makeProgram();
+  runNativeSynthesis(runA, "u,cx");
+  auto runB = makeProgram();
+  runNativeSynthesis(runB, "u,cx");
+  EXPECT_EQ(moduleToString(runA), moduleToString(runB));
+}
+
+TEST_F(NativeSynthesisPassTest, ThreeQubitGhzEquivalentOnCoreProfiles) {
+  // The reference unitary is profile-independent, so build it only once.
+  auto expected = mlir::qc::QCProgramBuilder::build(context.get(), threeQGhz);
+  runQcToQco(expected);
+  const auto expectedUnitary = computeNQubitUnitaryFromModule(expected);
+  ASSERT_TRUE(expectedUnitary.has_value());
+  for (const auto& profileCase : coreEquivalenceProfiles()) {
+    auto synthesized =
+        mlir::qc::QCProgramBuilder::build(context.get(), threeQGhz);
+    runNativeSynthesis(synthesized, profileCase.nativeGates);
+    EXPECT_TRUE(profileCase.isNative(synthesized));
+    const auto synthesizedUnitary = computeNQubitUnitaryFromModule(synthesized);
+    ASSERT_TRUE(synthesizedUnitary.has_value());
+    EXPECT_TRUE(
+        isEquivalentUpToGlobalPhase(*expectedUnitary, *synthesizedUnitary));
+  }
+}
diff --git a/mlir/unittests/Dialect/QCO/Utils/test_unitary_matrix.cpp b/mlir/unittests/Dialect/QCO/Utils/test_unitary_matrix.cpp
index afa0792415..cb12365bd5 100644
--- a/mlir/unittests/Dialect/QCO/Utils/test_unitary_matrix.cpp
+++ b/mlir/unittests/Dialect/QCO/Utils/test_unitary_matrix.cpp
@@ -12,8 +12,11 @@
 
 #include <gtest/gtest.h>
 
+#include <array>
 #include <cmath>
 #include <complex>
+#include <cstddef>
+#include <random>
 #include <utility>
 
 using namespace mlir::qco;
@@ -303,6 +306,140 @@ TEST(Matrix4x4, AssignFromDynamicMatrix) {
   EXPECT_FALSE(out.assignFrom(DynamicMatrix::identity(2)));
 }
 
+TEST(UnitaryMatrix2x2, TransposeAndIsIdentity) {
+  const auto transposed = Matrix2x2::fromElements(1, 2i, 3, 4).transpose();
+  EXPECT_TRUE(transposed.isApprox(Matrix2x2::fromElements(1, 3, 2i, 4)));
+  EXPECT_TRUE(Matrix2x2::identity().isIdentity());
+  EXPECT_FALSE(pauliX().isIdentity()); // a non-identity matrix is rejected
+}
+
+TEST(UnitaryMatrix4x4, TransposeAndIsIdentity) {
+  Matrix4x4 tagged = Matrix4x4::identity();
+  tagged(0, 3) = 2i;
+  tagged(3, 0) = 5.0;
+  const Matrix4x4 flipped = tagged.transpose();
+  EXPECT_EQ(flipped(3, 0), 2i); // moved across the diagonal, unconjugated
+  EXPECT_EQ(flipped(0, 3), 5.0);
+  EXPECT_TRUE(Matrix4x4::identity().isIdentity());
+  EXPECT_FALSE(swapMatrix().isIdentity());
+}
+
+TEST(UnitaryMatrix4x4, DiagonalColumnsAndParts) {
+  Matrix4x4 source =
+      Matrix4x4::fromElements(Complex{1, 1}, 0, 0, 0, 0, Complex{2, 2}, 0, 0, 0,
+                              0, Complex{3, 3}, 0, 0, 0, 0, Complex{4, 4});
+  // Diagonal extraction round-trips through fromDiagonal.
+  const auto diagonal = source.diagonal();
+  EXPECT_EQ(diagonal[0], (Complex{1, 1}));
+  EXPECT_EQ(diagonal[3], (Complex{4, 4}));
+  EXPECT_TRUE(Matrix4x4::fromDiagonal(diagonal).isApprox(source));
+  EXPECT_EQ(source.column(1)[1], (Complex{2, 2}));
+  // realPart/imagPart are indexed flat: 0 is the first, 15 the last entry.
+  const auto realPart = source.realPart();
+  const auto imagPart = source.imagPart();
+  EXPECT_EQ(realPart[0], 1.0);
+  EXPECT_EQ(imagPart[0], 1.0);
+  EXPECT_EQ(realPart[15], 4.0);
+  EXPECT_EQ(imagPart[15], 4.0);
+  // setColumn writes the given entries into column 2 of a second matrix.
+  Matrix4x4 target = Matrix4x4::identity();
+  target.setColumn(2, {1i, 2i, 3i, 4i});
+  EXPECT_EQ(target(0, 2), 1i);
+  EXPECT_EQ(target(3, 2), 4i);
+}
+
+TEST(UnitaryMatrix4x4, KroneckerProduct) {
+  const Matrix2x2 id = Matrix2x2::identity();
+  // kron(X, I) acts on the high bit: it exchanges the two 2x2 blocks.
+  const auto highFlip = Matrix4x4::fromElements(0, 0, 1, 0, // row 0
+                                                0, 0, 0, 1, // row 1
+                                                1, 0, 0, 0, // row 2
+                                                0, 1, 0, 0);
+  EXPECT_TRUE(kron(pauliX(), id).isApprox(highFlip));
+  // kron(I, X) acts on the low bit: X shows up in both diagonal blocks.
+  const auto lowFlip = Matrix4x4::fromElements(0, 1, 0, 0, // row 0
+                                               1, 0, 0, 0, // row 1
+                                               0, 0, 0, 1, // row 2
+                                               0, 0, 1, 0);
+  EXPECT_TRUE(kron(id, pauliX()).isApprox(lowFlip));
+}
+
+TEST(UnitaryMatrix2x2, ScalarLeftMultiply) {
+  const Complex phase = std::exp(1i * 0.5);
+  const Matrix2x2 gate = pauliX();
+  EXPECT_TRUE((phase * gate).isApprox(gate * phase)); // left == right scaling
+}
+
+TEST(UnitaryMatrix4x4, ScalarLeftMultiply) {
+  const Complex phase = std::exp(1i * 0.25);
+  const Matrix4x4 gate = swapMatrix();
+  EXPECT_TRUE((phase * gate).isApprox(gate * phase)); // left == right scaling
+}
+
+TEST(JacobiEigensolver, DiagonalMatrix) {
+  // Flat 4x4 diag(3, 1, 4, 2); the eigenvalues are expected in ascending order.
+  std::array<double, 16> diagonalInput{3.0, 0.0, 0.0, 0.0,
+                                       0.0, 1.0, 0.0, 0.0,
+                                       0.0, 0.0, 4.0, 0.0,
+                                       0.0, 0.0, 0.0, 2.0};
+  const SymmetricEigen4 eigen = jacobiSymmetricEigen(diagonalInput);
+  const std::array<double, 4> expected{1.0, 2.0, 3.0, 4.0};
+  for (std::size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_NEAR(eigen.eigenvalues[i], expected[i], 1e-12);
+  }
+}
+
+TEST(JacobiEigensolver, ReconstructsRandomSymmetric) {
+  constexpr std::size_t dim = 4;
+  std::mt19937 rng(0xC0FFEE);
+  std::uniform_real_distribution<double> entry(-2.0, 2.0);
+  for (int trial = 0; trial < 50; ++trial) {
+    // Draw the upper triangle and mirror it so the input is symmetric.
+    std::array<double, 16> sym{};
+    for (std::size_t row = 0; row < dim; ++row) {
+      for (std::size_t col = row; col < dim; ++col) {
+        sym[(row * dim) + col] = sym[(col * dim) + row] = entry(rng);
+      }
+    }
+    const SymmetricEigen4 eigen = jacobiSymmetricEigen(sym);
+    const auto& values = eigen.eigenvalues;
+    const Matrix4x4& vectors = eigen.eigenvectors;
+
+    // Eigenvalues come back sorted in ascending order (up to rounding).
+    for (std::size_t i = 1; i < dim; ++i) {
+      EXPECT_LE(values[i - 1], values[i] + 1e-12);
+    }
+
+    // The eigenvector matrix is orthogonal: V^T V == I.
+    EXPECT_TRUE((vectors.transpose() * vectors).isIdentity(1e-9));
+
+    // The spectral decomposition gives back the input: V D V^T == A.
+    const Matrix4x4 spectrum =
+        Matrix4x4::fromDiagonal({values[0], values[1], values[2], values[3]});
+    Matrix4x4 input{};
+    for (std::size_t flat = 0; flat < sym.size(); ++flat) {
+      input(flat / dim, flat % dim) = sym[flat];
+    }
+    const Matrix4x4 rebuilt = vectors * spectrum * vectors.transpose();
+    EXPECT_TRUE(rebuilt.isApprox(input, 1e-9));
+  }
+}
+
+TEST(JacobiEigensolver, HandlesDegenerateSpectrum) {
+  // 2.5 * I has a four-fold degenerate eigenvalue, so any basis diagonalizes
+  // it; the solver must nevertheless hand back an orthonormal one.
+  std::array<double, 16> scaledIdentity{};
+  for (std::size_t flat = 0; flat < scaledIdentity.size(); flat += 5) {
+    scaledIdentity[flat] = 2.5;
+  }
+  const SymmetricEigen4 eigen = jacobiSymmetricEigen(scaledIdentity);
+  for (const double lambda : eigen.eigenvalues) {
+    EXPECT_NEAR(lambda, 2.5, 1e-12);
+  }
+  const Matrix4x4& basis = eigen.eigenvectors;
+  EXPECT_TRUE((basis.transpose() * basis).isIdentity(1e-9));
+}
+
 TEST(DynamicMatrix, IsApproxOverloads) {
   const Matrix1x1 phase = Matrix1x1::fromElements(Complex{0.25, 0.5});
   const Matrix2x2 x = pauliX();
diff --git a/mlir/unittests/TestCaseUtils.h b/mlir/unittests/TestCaseUtils.h
index 570c86c87f..7603fda8be 100644
--- a/mlir/unittests/TestCaseUtils.h
+++ b/mlir/unittests/TestCaseUtils.h
@@ -18,6 +18,8 @@
 #include <llvm/Support/raw_ostream.h>
 #include <mlir/IR/BuiltinOps.h>
 
+#include <cmath>
+#include <complex> // NOLINT(misc-include-cleaner)
 #include <cstddef>
 #include <cstdlib>
 #include <string>
@@ -26,6 +28,26 @@
 
 namespace mqt::test {
 
+/**
+ * @brief Test whether @p lhs equals @p rhs times a unit-modulus scalar.
+ *
+ * The candidate phase is the normalized overlap `trace(rhs^H * lhs)`. When
+ * the overlap magnitude is at most @p atol the matrices are reported as not
+ * equivalent, because normalizing would divide by a near-zero number.
+ *
+ * @param atol tolerance for the overlap guard and for the final `isApprox`
+ * @return true iff `lhs.isApprox(phase * rhs, atol)` holds for that phase
+ */
+template <typename Matrix>
+[[nodiscard]] bool isEquivalentUpToGlobalPhase(const Matrix& lhs,
+                                               const Matrix& rhs,
+                                               double atol = 1e-10) {
+  const auto overlap = (rhs.adjoint() * lhs).trace();
+  const auto magnitude = std::abs(overlap);
+  const bool phaseDefined = !(magnitude <= atol);
+  return phaseDefined && lhs.isApprox((overlap / magnitude) * rhs, atol);
+}
+
 template <typename BuilderT> struct NamedBuilder {
   const char* name = nullptr;
   void (*fn)(BuilderT&) = nullptr;
diff --git a/mlir/unittests/programs/qc_programs.cpp b/mlir/unittests/programs/qc_programs.cpp
index 8410afd7a8..e3d0e81b65 100644
--- a/mlir/unittests/programs/qc_programs.cpp
+++ b/mlir/unittests/programs/qc_programs.cpp
@@ -11,6 +11,7 @@
 #include "qc_programs.h"
 
 #include "mlir/Dialect/QC/Builder/QCProgramBuilder.h"
+#include "mlir/IR/Value.h"
 
 #include <numbers>
 
@@ -1616,5 +1617,4 @@ void nestedForLoopCtrlOpWithExtractedQubit(QCProgramBuilder& b) {
     b.cx(reg[0], q0);
   });
 }
-
 } // namespace mlir::qc