
Commit 4c96f3d

aschaffer and manopapad authored
Tall-Skinny SVD (TSSVD) (nv-legate#702)
* Preliminary TSSVD impl.
* Replaced diagflat w/ diag.
* Fixes, handling permutations due to svals order, prelim tests.
* More testing, removed fake complexification trickling down from eig().
* Made testing more robust and comprehensive.
* Error reporting on a.shape.
* Error reporting on singular matrix.
* Test with non-singular matrices only.
* Documented the non-singular matrix constraint.
* Added profiling TSSVD example.
* Update cupynumeric/linalg/linalg.py: dox update on input constraints.
* Update cupynumeric/linalg/linalg.py: dox update on reference.
* Fixed some dox mishaps, following GitHub 'Commit Suggestion'.
* Addressed review on moving argsort.
* Addressed review on to-real truncation.
* Addressed review on .conj().
* Addressed review on comment typo.
* Addressed review on raising LinAlgError on a singular matrix.
* Addressed review on multiple GPU/CPU dox.
* Addressed review on using eigh() rather than eig().
* Reverted eigh() to eig(), due to failures in the former.
* Branches off batched vs. unbatched matmul, to benefit tall-skinny.
* Fix for surpassing memory bounds in the unbatched case.
* Cleaned up print() in linalg.py.
* Added batched vs. unbatched matmul test.
* Enlarged shape for TSSVD example.
* More ts_matmul() usage.
* Fix for eager path of ts_matmul().
* More efficient permutation and enforced single-GPU argsort().
* Reverted TSSVD example to avoid OOM in CI.
* Addressed review on removing 'full matrices' note in dox.
* Addressed review on using cupynumeric primitives in examples.
* Addressed review on adding comment and assertion in mapper.cc.
* Addressed review on tailoring example for benchmarking.
* Fixed output allocation.

Co-authored-by: Manolis Papadakis <manopapad@gmail.com>
1 parent 038ffe2 commit 4c96f3d

10 files changed, with 674 additions and 22 deletions
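
For orientation (editorial note, not part of the commit): the method rests on standard SVD algebra. Writing the SVD of a tall-skinny A gives

    A = U \Sigma V^{H}
    \;\Longrightarrow\;
    A^{H} A = V \Sigma^{2} V^{H},
    \qquad
    U = A V \Sigma^{-1},

so the (N, N) Gram matrix A^H A is small enough to eigendecompose on a single processor, after which U follows from tall-skinny products only.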


cupynumeric/_thunk/deferred.py

Lines changed: 49 additions & 0 deletions
@@ -3916,3 +3916,52 @@ def stencil_hint(
         legate_runtime.prefetch_bloated_instances(
             self.base, low_offsets, high_offsets, False
         )
+
+    @auto_convert("rhs1_thunk", "rhs2_thunk")
+    def ts_matmul(
+        self,
+        rhs1_thunk: Any,
+        rhs2_thunk: Any) -> Any:
+
+        lhs_thunk: NumPyThunk = self
+
+        # Clear output array
+        lhs_thunk.fill(np.array(0, dtype=lhs_thunk.dtype))
+        lhs = lhs_thunk.base  # type: ignore
+
+        rhs1 = rhs1_thunk.base
+        rhs2 = rhs2_thunk.base
+
+        m = lhs.shape[0]
+        n = lhs.shape[1]
+        k = rhs1.shape[1]
+        unbatched = 1
+
+        assert m == rhs1.shape[0]
+        assert n == rhs2.shape[1]
+        assert k == rhs2.shape[0]
+        lhs = lhs.promote(1, k)
+        rhs1 = rhs1.promote(2, n)
+        rhs2 = rhs2.promote(0, m)
+
+        task = legate_runtime.create_auto_task(
+            self.library, CuPyNumericOpCode.MATMUL
+        )
+        p_lhs = task.add_reduction(lhs, ReductionOpKind.ADD)
+        p_rhs1 = task.add_input(rhs1)
+        p_rhs2 = task.add_input(rhs2)
+        #
+        # specify unbatched matrix multiplication:
+        #
+        task.add_scalar_arg(unbatched, ty.uint32)
+
+        task.add_constraint(align(p_lhs, p_rhs1))
+        task.add_constraint(align(p_lhs, p_rhs2))
+        #
+        # additional constraints:
+        #
+        # task.add_constraint(broadcast(p_rhs1, (0,)))
+        # task.add_constraint(broadcast(p_rhs2, (1,)))
+        task.add_constraint(broadcast(p_lhs))
+        #
+        task.execute()
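
To make the promote() choreography concrete, here is a minimal NumPy sketch of the (m, k, n) iteration space that ts_matmul() sets up (editorial illustration; the shapes mirror the code above, but the sketch itself is not part of the commit):

    import numpy as np

    # lhs (m, n), rhs1 (m, k) and rhs2 (k, n) are all promoted into one
    # common (m, k, n) space; every point contributes one product term,
    # and the ReductionOpKind.ADD reduction folds the k axis back into lhs.
    m, k, n = 1000, 8, 4
    rhs1 = np.random.rand(m, k)
    rhs2 = np.random.rand(k, n)

    terms = rhs1[:, :, None] * rhs2[None, :, :]  # broadcast to (m, k, n)
    lhs = terms.sum(axis=1)                      # ADD-reduce over k

    assert np.allclose(lhs, rhs1 @ rhs2)

The runtime is then free to partition this shared space across processors and combine the per-partition partial sums through the ADD reduction registered on p_lhs.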

cupynumeric/_thunk/eager.py

Lines changed: 6 additions & 0 deletions
@@ -2128,3 +2128,9 @@ def stencil_hint(
     ) -> None:
         if self.deferred is not None:
             self.deferred.stencil_hint(low_offsets, high_offsets)
+
+    def ts_matmul(
+        self,
+        rhs1_thunk: Any,
+        rhs2_thunk: Any) -> Any:
+        np.matmul(rhs1_thunk.array, rhs2_thunk.array, out=self.array)

cupynumeric/_thunk/thunk.py

Lines changed: 8 additions & 0 deletions
@@ -1651,3 +1651,11 @@ def stencil_hint(
         high_offsets: tuple[int, ...],
     ) -> None:
         ...
+
+    @abstractmethod
+    def ts_matmul(
+        self,
+        rhs1_thunk: Any,
+        rhs2_thunk: Any
+    ) -> Any:
+        ...

cupynumeric/linalg/linalg.py

Lines changed: 143 additions & 2 deletions
@@ -32,11 +32,21 @@
     normalize_axis_tuple,
 )

-from legate.core import get_machine
+from cupynumeric.config import CuPyNumericOpCode
+from legate.core import (
+    get_machine,
+    get_legate_runtime,
+    ReductionOpKind,
+    align,
+    broadcast,
+)

 from .._array.util import add_boilerplate, convert_to_cupynumeric_ndarray
 from .._module import dot, empty_like, eye, matmul, ndarray
-from .._module.creation_shape import zeros, zeros_like
+from .._module.array_rearrange import flip
+from .._module.creation_shape import empty, zeros, zeros_like
+from .._module.creation_matrices import diag
+from .._module.ssc_sorting import argsort
 from .._ufunc.math import add, sqrt as _sqrt
 from ._exception import LinAlgError

@@ -1571,3 +1581,134 @@ def expm(a: ndarray, method: str = "pade") -> ndarray:
         mdeg, s = expm_func(a[idx], output[idx])

     return output
+
+
+@add_boilerplate("a")
+def tssvd(a: ndarray) -> tuple[ndarray, ...]:
+    """
+    Tall-skinny (TS) Singular Value Decomposition.
+
+    Parameters
+    ----------
+    a : (M, N) array_like
+        Array-like, of dimension 2.
+
+    Returns
+    -------
+    u : (M, N) array_like
+        Unitary array(s).
+    s : (N,) array_like
+        The singular values, sorted in descending order.
+    vh : (N, N) array_like
+        Unitary array(s).
+
+    Raises
+    ------
+    LinAlgError
+        If the input matrix is singular or the TS-SVD computation
+        does not converge.
+
+    Notes
+    -----
+    This routine is only efficient if ``M >> N``. In particular, it
+    assumes that an ``(N, N)`` matrix fits within a single processor's
+    memory.
+
+    Implements the algorithm described in [1]_.
+
+    Requires ``a.T @ a`` to be non-singular; equivalently, the input
+    matrix must have full column rank.
+
+    See Also
+    --------
+    numpy.linalg.svd
+
+    Availability
+    ------------
+    Multiple GPUs, Multiple CPUs
+
+    References
+    ----------
+    .. [1] https://stanford.edu/~rezab/classes/cme323/S22/notes/L17/cme323_lec17.pdf
+    """
+    if a.ndim != 2 or a.size <= 1:
+        raise ValueError(f"Invalid input shape for tssvd: {a.shape}")
+
+    m_info = get_machine()
+    num_PEs = m_info.count()
+
+    # A.T*A:
+    #
+    # unbatched way (there's a bug resulting in 0-matrix, it seems):
+    # {
+    m = a.shape[0]
+    n = a.shape[1]
+
+    # TODO: Gramian API:
+    #
+    a2 = empty(shape=(n, n), dtype=a.dtype)
+    ah = a.transpose().conj()
+    a2._thunk.ts_matmul(ah._thunk, a._thunk)
+    # }
+    #
+    # batched way (slower, but passes):
+    #
+    # a2 = matmul(a.transpose().conj(), a)
+
+    # eigen-vals, eigen-vecs of A.T*A:
+    #
+    ew, ev = eigh(a2)
+
+    if any(abs(ew) <= np.finfo(a.dtype).eps):
+        raise LinAlgError("Singular matrix. Method cannot be applied.")
+
+    # svals = map sqrt ew
+    #
+    svals = _sqrt(ew)
+
+    # bring to standard form, i.e., decreasing singular values:
+    #
+    # generate the index permutation, pi,
+    # via sort-by-key, decreasingly:
+    #
+    d_indices = zeros(shape=(n,), dtype=np.int64)
+    with m_info[0]:  # enforce single-PE argsort
+        d_indices = argsort(svals)
+    #
+    # reverse:
+    #
+    # d_indices = d_indices[::-1]  # Error: not implemented
+    d_indices = flip(d_indices)
+
+    # V.T:
+    #
+    vt = ev.transpose().conj()
+
+    reciprocal_svals = 1.0 / svals
+    Sinv = diag(reciprocal_svals)
+
+    # U = A*V*inv(S):
+    #
+    # B = matmul(ev, Sinv)
+    # u = matmul(a, B)
+
+    B = empty(shape=(n, n), dtype=a.dtype)
+    B._thunk.ts_matmul(ev._thunk, Sinv._thunk)
+
+    u = empty(shape=(m, n), dtype=a.dtype)
+    u._thunk.ts_matmul(a._thunk, B._thunk)
+
+    # re-arrange svals decreasingly:
+    #
+    svals = svals[d_indices]
+
+    # permute columns of U with pi:
+    #
+    # u = u[:, d_indices]
+    u = matmul(u, eye(u.shape[1])[d_indices].T)
+
+    # permute rows of V.T with pi:
+    #
+    vt = vt[d_indices]
+
+    return u, svals, vt
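
As a cross-check of tssvd()'s algebra, here is a NumPy-only sketch under the same full-column-rank assumption (editorial reference code, not part of the commit; it uses numpy.linalg.eigh where the distributed code goes through cupynumeric's eigensolver):

    import numpy as np

    rng = np.random.default_rng(0)
    a = rng.standard_normal((1000, 8))     # tall-skinny, full column rank

    # Eigendecomposition of the small (n, n) Gram matrix yields V and S^2:
    ew, ev = np.linalg.eigh(a.conj().T @ a)

    # Sort-by-key, descending, mirroring the argsort + flip above:
    order = np.argsort(np.sqrt(ew))[::-1]
    s = np.sqrt(ew)[order]
    v = ev[:, order]

    # U = A V S^{-1}: only tall-skinny products, never an (M, M) object.
    u = (a @ v) / s

    assert np.allclose((u * s) @ v.conj().T, a)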

examples/tssvd.py

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+
+# Copyright 2025 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import re
+
+from benchmark import *
+
+import cupynumeric as num
+import numpy as np
+
+
+def check_result(a, u, s, vh):
+    print("Checking result...")
+
+    # (u * s) @ vh
+    a2 = num.matmul(u * s, vh)
+    print("PASS!" if num.allclose(a, a2) else "FAIL!")
+
+
+# make a random, real, full-column-rank (m, n) matrix, with m > n:
+#
+def make_random_matrix(
+    m: int, n: int, scale: float = 10.0,
+    dtype_=np.dtype("float64")) -> np.ndarray:
+    num.random.seed(6174)
+
+    mat = scale * num.random.rand(m, n)
+
+    mat = mat.astype(dtype_)
+
+    # strictly diagonally dominant:
+    #
+    for i in range(n):
+        mat[i, i] = 1.0 + num.sum(num.abs(mat[i, :]))
+
+    return mat
+
+
+def run_tssvd(m, n, perform_check, timing):
+    A = make_random_matrix(m, n)
+
+    timer.start()
+    u, s, vh = num.linalg.tssvd(A)
+    total = timer.stop()
+
+    if perform_check:
+        check_result(A, u, s, vh)
+
+    if timing:
+        print(f"TSSVD elapsed time: {total:.3f} ms")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-t",
+        "--time",
+        dest="timing",
+        action="store_true",
+        help="perform timing",
+    )
+    parser.add_argument(
+        "-m",
+        "--rows",
+        type=int,
+        default=10,
+        dest="m",
+        help="number of rows in the matrix",
+    )
+    parser.add_argument(
+        "-n",
+        "--cols",
+        type=int,
+        default=10,
+        dest="n",
+        help="number of cols in the matrix",
+    )
+    parser.add_argument(
+        "--check",
+        dest="check",
+        action="store_true",
+        help="compare result to numpy",
+    )
+    args, num, timer = parse_args(parser)
+
+    run_benchmark(
+        run_tssvd,
+        args.benchmark,
+        "TSSVD",
+        (
+            args.m,
+            args.n,
+            args.check,
+            args.timing,
+        ),
+    )
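
For reference, an invocation along these lines should drive the example; the legate launcher flags are assumptions about a typical multi-GPU setup, not part of the commit:

    legate --gpus 2 examples/tssvd.py -m 1000000 -n 32 --check --time

Here -m and -n pick a strongly rectangular shape, --check reconstructs (u * s) @ vh and compares it to the input, and --time prints the elapsed milliseconds.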
