Manually added LRT logic from scverse#178

kimsjune · web-flow · commit d507fba955c9 · 2025-06-05T17:53:03.000-04:00
diff --git a/pydeseq2/ds.py b/pydeseq2/ds.py
@@ -2,6 +2,7 @@
 import time
 import warnings
 from typing import Literal
+from typing import Tuple
 
 # import anndata as ad
 import numpy as np
@@ -14,7 +15,7 @@
 from pydeseq2.inference import Inference
 from pydeseq2.utils import lowess
 from pydeseq2.utils import make_MA_plot
-
+from pydeseq2.utils import lrt_test
 
 class DeseqStats:
     """PyDESeq2 statistical tests for differential expression.
@@ -131,6 +132,7 @@ class DeseqStats:
     def __init__(
         self,
         dds: DeseqDataSet,
+        test: Literal["wald", "LRT"] = "wald",
         contrast: list[str] | np.ndarray,
         alpha: float = 0.05,
         cooks_filter: bool = True,
@@ -150,6 +152,10 @@ def __init__(
 
         self.dds = dds
 
+        if test not in ("wald", "LRT"):
+            raise ValueError(f"Available tests are `wald` and `LRT`. Got: {test}.")
+        self.test = test
+
         self.alpha = alpha
         self.cooks_filter = cooks_filter
         self.independent_filter = independent_filter
@@ -213,12 +219,23 @@ def __init__(
                 "refitted. Please run 'dds.refit()' first or set 'dds.refit_cooks' "
                 "to False."
             )
+        
+        self.p_values: pd.Series
+        self.statistics: pd.Series
+        self.SE: pd.Series
+
+
+
+        
+
+
 
     @property
     def variables(self):
         """Get the names of the variables used in the model definition."""
         return self.dds.variables
 
+
     def summary(
         self,
         **kwargs,
@@ -262,6 +279,11 @@ def summary(
             rerun_summary = True
             self.run_wald_test()
 
+            if self.test == "wald":
+                self.run_wald_test()
+            else:
+                self.run_likelihood_ratio_test()
+
         if self.cooks_filter:
             # Filter p-values based on Cooks outliers
             self._cooks_filtering()
@@ -280,6 +302,8 @@ def summary(
         self.results_df["baseMean"] = self.base_mean
         self.results_df["log2FoldChange"] = self.LFC @ self.contrast_vector / np.log(2)
         self.results_df["lfcSE"] = self.SE / np.log(2)
+        if self.test == "wald":
+            self.results_df["lfcSE"] = self.SE / np.log(2)
         self.results_df["stat"] = self.statistics
         self.results_df["pvalue"] = self.p_values
         self.results_df["padj"] = self.padj
@@ -348,16 +372,85 @@ def run_wald_test(self) -> None:
         if not self.quiet:
             print(f"... done in {end-start:.2f} seconds.\n", file=sys.stderr)
 
-        self.p_values: pd.Series = pd.Series(pvals, index=self.dds.var_names)
-        self.statistics: pd.Series = pd.Series(stats, index=self.dds.var_names)
-        self.SE: pd.Series = pd.Series(se, index=self.dds.var_names)
+        self.p_values = pd.Series(pvals, index=self.dds.var_names)
+        self.statistics = pd.Series(stats, index=self.dds.var_names)
+        self.SE = pd.Series(se, index=self.dds.var_names)
 
         # Account for possible all_zeroes due to outlier refitting in DESeqDataSet
         if self.dds.refit_cooks and self.dds.varm["replaced"].sum() > 0:
             self.SE.loc[self.dds.new_all_zeroes_genes] = 0.0
             self.statistics.loc[self.dds.new_all_zeroes_genes] = 0.0
             self.p_values.loc[self.dds.new_all_zeroes_genes] = 1.0
 
+    def run_likelihood_ratio_test(self) -> None:
+        """Perform a Likelihood Ratio test.
+        Get gene-wise p-values for gene over/under-expression.
+        """
+
+        num_genes = self.dds.n_vars
+        num_vars = self.design_matrix.shape[1]
+
+        # XXX: Raise a warning if LFCs are shrunk.
+
+        def reduce(
+            design_matrix: np.ndarray, ridge_factor: np.ndarray
+        ) -> Tuple[np.ndarray, np.ndarray]:
+            indices = np.full(design_matrix.shape[1], True, dtype=bool)
+            indices[self.contrast_idx] = False
+            return design_matrix[:, indices], ridge_factor[:, indices][indices]
+
+        # Set regularization factors.
+        if self.prior_LFC_var is not None:
+            ridge_factor = np.diag(1 / self.prior_LFC_var**2)
+        else:
+            ridge_factor = np.diag(np.repeat(1e-6, num_vars))
+
+        design_matrix = self.design_matrix.values
+        LFCs = self.LFC.values
+
+        reduced_design_matrix, reduced_ridge_factor = reduce(design_matrix, ridge_factor)
+        self.dds.obsm["reduced_design_matrix"] = reduced_design_matrix
+
+        if not self.quiet:
+            print("Running LRT tests...", file=sys.stderr)
+        start = time.time()
+        with parallel_backend("loky", inner_max_num_threads=1):
+            res = Parallel(
+                n_jobs=self.n_processes,
+                verbose=self.joblib_verbosity,
+                batch_size=self.batch_size,
+            )(
+                delayed(lrt_test)(
+                    counts=self.dds.X[:, i],
+                    design_matrix=design_matrix,
+                    reduced_design_matrix=reduced_design_matrix,
+                    size_factors=self.dds.obsm["size_factors"],
+                    disp=self.dds.varm["dispersions"][i],
+                    lfc=LFCs[i],
+                    min_mu=self.dds.min_mu,
+                    ridge_factor=ridge_factor,
+                    reduced_ridge_factor=reduced_ridge_factor,
+                    beta_tol=self.dds.beta_tol,
+                )
+                for i in range(num_genes)
+            )
+        end = time.time()
+        if not self.quiet:
+            print(f"... done in {end-start:.2f} seconds.\n", file=sys.stderr)
+
+        pvals, stats = zip(*res)
+
+        self.p_values = pd.Series(pvals, index=self.dds.var_names)
+        self.statistics = pd.Series(stats, index=self.dds.var_names)
+
+        # Account for possible all_zeroes due to outlier refitting in DESeqDataSet
+        if self.dds.refit_cooks and self.dds.varm["replaced"].sum() > 0:
+            self.statistics.loc[self.dds.new_all_zeroes_genes] = 0.0
+            self.p_values.loc[self.dds.new_all_zeroes_genes] = 1.0
+
+
+
+
     # TODO update this to reflect the new contrast format
     def lfc_shrink(self, coeff: str, adapt: bool = True) -> None:
         """LFC shrinkage with an apeGLM prior :cite:p:`DeseqStats-zhu2019heavy`.
diff --git a/pydeseq2/utils.py b/pydeseq2/utils.py
@@ -12,6 +12,7 @@
 from scipy.optimize import minimize  # type: ignore
 from scipy.special import gammaln  # type: ignore
 from scipy.special import polygamma  # type: ignore
+from scipy.stats import chi2  # type: ignore
 from scipy.stats import norm  # type: ignore
 from sklearn.linear_model import LinearRegression  # type: ignore
 
@@ -811,6 +812,82 @@ def less_abs(lfc_null):
 
     return wald_p_value, wald_statistic, wald_se
 
+def lrt_test(
+    counts: np.ndarray,
+    design_matrix: np.ndarray,
+    reduced_design_matrix: np.ndarray,
+    size_factors: np.ndarray,
+    disp: float,
+    lfc: np.ndarray,
+    min_mu: float,
+    ridge_factor: np.ndarray,
+    reduced_ridge_factor: np.ndarray,
+    beta_tol: float,
+) -> Tuple[float, float]:
+    """Run likelihood ratio test for differential expression.
+    Compute likelihood ratio test statistics and p-values from
+    dispersion and LFC estimates.
+    Parameters
+    ----------
+    counts : ndarray
+        Raw counts for a given gene.
+    design_matrix : ndarray
+        Design matrix.
+    reduced_design_matrix : ndarray
+        Reduced design matrix.
+    size_factors : ndarray
+        DESeq2 normalization factors.
+    disp : float
+        Dispersion estimate.
+    lfc : ndarray
+        Log-fold change estimate (in natural log scale).
+    min_mu : float
+        Lower bound on estimated means, to ensure numerical stability.
+        (default: ``0.5``).
+    ridge_factor : ndarray
+        Regularization factors.
+    reduced_ridge_factor : ndarray
+        Reduced regularization factors.
+    beta_tol : float
+        Stopping criterion for IRWLS:
+        :math:`\vert dev - dev_{old}\vert / \vert dev + 0.1 \vert < \beta_{tol}`.
+        (default: ``1e-8``).
+    Returns
+    -------
+    lrt_p_value : float
+        Estimated p-value.
+    lrt_statistic : float
+        LRT statistic.
+    """
+
+    def reg_nb_nll(
+        beta: np.ndarray, design_matrix: np.ndarray, ridge_factor: np.ndarray
+    ) -> float:
+        # closure to minimize
+        mu_ = np.maximum(size_factors * np.exp(design_matrix @ beta), min_mu)
+        val = nb_nll(counts, mu_, disp) + 0.5 * (ridge_factor @ beta**2).sum()
+        return -1.0 * val  # maximize the likelihood
+
+    beta_reduced, *_ = irls_solver(
+        counts=counts,
+        size_factors=size_factors,
+        design_matrix=reduced_design_matrix,
+        disp=disp,
+        min_mu=min_mu,
+        beta_tol=beta_tol,
+    )
+
+    reduced_ll = reg_nb_nll(beta_reduced, reduced_design_matrix, reduced_ridge_factor)
+    full_ll = reg_nb_nll(lfc, design_matrix, ridge_factor)
+
+    lrt_statistic = 2 * (full_ll - reduced_ll)
+    # df = 1 since contrast_idx is the only variable removed
+    lrt_p_value = chi2.sf(lrt_statistic, df=1)
+
+    print(lrt_p_value)
+    print(lrt_statistic)
+
+    return lrt_p_value, lrt_statistic
 
 def fit_rough_dispersions(
     normed_counts: np.ndarray, design_matrix: pd.DataFrame