Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
292 changes: 115 additions & 177 deletions bin/annotate_omega_failing.py

Large diffs are not rendered by default.

72 changes: 57 additions & 15 deletions bin/check_contamination.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,20 +41,7 @@ def compute_shared_variants(somatic_variants, germline_variants):



def contamination_detection(maf_file, somatic_maf_file):

maf_df = pd.read_table(maf_file, na_values=custom_na_values)
print(maf_df.shape)
maf_df = maf_df[~(maf_df["FILTER.not_covered"])
& (maf_df["TYPE"] == 'SNV')
].reset_index()
print(maf_df.shape)

somatic_maf_df = pd.read_table(somatic_maf_file, na_values=custom_na_values)
print(somatic_maf_df.shape)
somatic_maf_df = somatic_maf_df[(somatic_maf_df["TYPE"] == 'SNV')]
print(somatic_maf_df.shape)

def contamination_detection_between_samples(maf_df, somatic_maf_df):

# this is if we were to consider both unique and no-unique variants
vaf_threshold = 0.2
Expand Down Expand Up @@ -372,14 +359,69 @@ def contamination_detection(maf_file, somatic_maf_file):



def data_loading(maf_path, somatic_maf_path):
    """Load the full MAF and the somatic MAF, keeping only usable SNVs.

    The full MAF is restricted to covered positions (FILTER.not_covered is
    falsy) of TYPE 'SNV'; the somatic MAF is restricted to TYPE 'SNV' only.
    Shapes before and after filtering are printed for quick sanity checks.

    Args:
        maf_path: path to the full MAF table (TSV).
        somatic_maf_path: path to the filtered somatic mutations table (TSV).

    Returns:
        Tuple of (filtered full MAF DataFrame, filtered somatic MAF DataFrame).
    """
    maf_df = pd.read_table(maf_path, na_values=custom_na_values)
    print(maf_df.shape)
    covered = ~maf_df["FILTER.not_covered"]
    is_snv = maf_df["TYPE"] == 'SNV'
    maf_df = maf_df[covered & is_snv].reset_index()
    print(maf_df.shape)

    somatic_maf_df = pd.read_table(somatic_maf_path, na_values=custom_na_values)
    print(somatic_maf_df.shape)
    somatic_maf_df = somatic_maf_df[somatic_maf_df["TYPE"] == 'SNV']
    print(somatic_maf_df.shape)

    return maf_df, somatic_maf_df


def contamination_detection_in_snps(maf):
    """Screen gnomAD SNP positions for low-VAF calls suggestive of contamination.

    Calls flagged as gnomAD SNPs are split by VAF: below 0.05 they are treated
    as potentially contaminant ("somatic-like"), at or above 0.05 as genuine
    germline. Per sample, the proportion of somatic-like SNP calls over the
    SNP positions not already explained by germline in that sample is written
    to sample_SNP_mutation_freq.tsv, most suspicious samples first.

    Args:
        maf: DataFrame with at least SAMPLE_ID, MUT_ID, VAF and the boolean
            FILTER.gnomAD_SNP columns.
    """
    snp_calls = maf[maf["FILTER.gnomAD_SNP"]][
                                ["SAMPLE_ID", "MUT_ID", "VAF"]
                            ].reset_index(drop=True)

    # very restrictive VAF cut-off to count occurrences of potentially
    # contaminated mutations: <0.05 somatic-like, >=0.05 germline-like
    low_vaf_calls = snp_calls[snp_calls["VAF"] < 0.05].reset_index(drop=True)
    high_vaf_calls = snp_calls[snp_calls["VAF"] >= 0.05].reset_index(drop=True)

    n_unique_snp_positions = len(snp_calls["MUT_ID"].unique())

    # per-sample counts, computed once instead of filtering inside the loop
    germline_per_sample = high_vaf_calls["SAMPLE_ID"].value_counts()
    somatic_per_sample = low_vaf_calls["SAMPLE_ID"].value_counts()

    rows = []
    for sample in snp_calls["SAMPLE_ID"].unique():
        germline_count = int(germline_per_sample.get(sample, 0))
        somatic_count = int(somatic_per_sample.get(sample, 0))
        remaining_germline = n_unique_snp_positions - germline_count
        # if every SNP position is already germline in this sample there is
        # nothing left to contaminate: report the maximal proportion (1)
        proportion = somatic_count / remaining_germline if remaining_germline > 0 else 1
        rows.append([sample, germline_count, remaining_germline, somatic_count, proportion])

    freq_df = pd.DataFrame(
        rows,
        columns=["SAMPLE_ID", "germline_count", "remaining_germline",
                 "somatic_count", "prop_somatic_SNPs"],
    )

    # rank samples so candidate contaminated ones (highest proportion) come first
    freq_df = freq_df.sort_values(by="prop_somatic_SNPs", ascending=False)
    freq_df.to_csv("sample_SNP_mutation_freq.tsv", header=True, sep='\t', index=False)



@click.command()
@click.option('--maf_path', type=click.Path(exists=True), required=True, help='Path to the MAF file.')
@click.option('--somatic_maf', type=click.Path(exists=True), required=True, help='Path to the filtered somatic mutations file.')
def main(maf_path, somatic_maf):
    """
    CLI entry point for assessing contamination between samples using germline and somatic mutations.

    Loads and SNV-filters both MAF tables once, then runs the two analyses
    on the shared DataFrames. (The stale call to the removed
    contamination_detection() function was dropped: that function was renamed
    to contamination_detection_between_samples and now takes the pre-loaded
    DataFrames, so the old call would raise NameError at runtime.)
    """
    maf_df, somatic_maf_df = data_loading(maf_path, somatic_maf)

    print("Running contamination analysis between samples")
    contamination_detection_between_samples(maf_df, somatic_maf_df)

    print("Running general contamination analysis")
    contamination_detection_in_snps(maf_df)



Expand Down
76 changes: 56 additions & 20 deletions bin/mut_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,17 @@
from utils_plot import plot_profile
from read_utils import custom_na_values

def bayesian_update(observed_counts, prior_profile, limit=200):
    """Top up sparse observed mutation counts with prior-derived pseudo-counts.

    If the observed counts already sum to at least ``limit`` they are returned
    untouched. Otherwise ``limit - N`` pseudo-counts are distributed across
    channels according to ``prior_profile`` and added to the observations.

    Args:
        observed_counts: per-channel mutation counts (array-like / DataFrame).
        prior_profile: DataFrame with a CONTEXT_MUT column and one column of
            per-channel prior values — presumably probabilities summing to 1,
            so the added mass totals ``limit - N``; confirm against the caller.
        limit: minimum total count below which smoothing is applied.

    Returns:
        The original ``observed_counts`` when N >= limit, otherwise the
        smoothed per-channel matrix.
    """
    total_observed = np.sum(observed_counts).item()

    if total_observed >= limit:
        return observed_counts

    indexed_prior = prior_profile.set_index("CONTEXT_MUT")
    pseudo_counts = (limit - total_observed) * indexed_prior
    return pseudo_counts.values + observed_counts


def compute_mutation_matrix(sample_name, mutations_file, mutation_matrix, method, pseudocount,
sigprofiler, per_sample):
Expand Down Expand Up @@ -98,7 +109,6 @@ def compute_mutation_matrix(sample_name, mutations_file, mutation_matrix, method
sep = "\t")



def profile_stability(counts, denominator):
"""
Compute stability score of a probability profile by
Expand Down Expand Up @@ -164,23 +174,28 @@ def profile_stability(counts, denominator):
}


def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_counts_file, plot,
wgs = False, wgs_trinucleotide_counts = False, sigprofiler = False):
def compute_mutation_profile(sample_name, mutation_matrix, trinucleotide_counts_file, plot,
wgs = False, wgs_trinucleotide_counts = False, sigprofiler = False,
smoothed = False, prior_profile_file = None,
minimum_mutations = 200
):
"""
Compute mutational profile from the input data

Required information:
Mutation matrix
Trinucleotide content of the sequenced region (depth-aware or non-depth-aware)

Minimum number of mutations to apply Bayesian smoothing (if smoothed is active)
This has been set to 200 after proper testing of the minimum required entropy of mutational profiles.
Output:
Mutational profile per sample
"""

# Load your mutation matrix
mutation_matrix = pd.read_csv(mutation_matrix_file, sep = "\t", header = 0)
mutation_matrix = mutation_matrix.set_index("CONTEXT_MUT")
total_mutations = np.sum(mutation_matrix[sample_name])

# proportion of SBS mutations per trinucleotide in panel
mutation_matrix_proportions = mutation_matrix.copy()
mutation_matrix_proportions[sample_name] = mutation_matrix_proportions[sample_name] / total_mutations
Expand Down Expand Up @@ -214,17 +229,6 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co
# normalize
mut_probability = mut_probability / mut_probability.sum()

# if there is any channel with 0 probability we need to add a pseudocount
if not all(mut_probability[sample_name].values > 0):
# find the minimum value greater than 0
min_value_non_zero = mut_probability[mut_probability > 0].min()
# print(min_value_non_zero)

# add a dynamic pseudocount of one third of the minimum number greater than 0
mut_probability = mut_probability + (min_value_non_zero / 3)

mut_probability = mut_probability / mut_probability.sum()


# reindex to ensure the right order
mut_probability = mut_probability.reindex(contexts_formatted)
Expand Down Expand Up @@ -271,14 +275,40 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co
header = True,
index = True,
sep = "\t")

if total_mutations < minimum_mutations and smoothed:
print(f"Using a prior profile to smooth the profile, since the mutation count is < {minimum_mutations}")
prior_profile = pd.read_table(prior_profile_file)

upd_mutation_matrix_wgs = bayesian_update(profile_trinuc_clean, prior_profile, minimum_mutations).reset_index()
total_mutations = max(minimum_mutations, total_mutations)

# we have updated the mutation counts at the WGS level, we should now revert this update
# for the specific sample's trinucleotide content and depth

upd_mutation_matrix_wgs["CONTEXT"] = upd_mutation_matrix_wgs["CONTEXT_MUT"].apply( lambda x : x[:3])
profile_trinuc_merge = upd_mutation_matrix_wgs.merge(ref_trinuc32, on = "CONTEXT")

# relative mutability per trinucleotide change in the panel after smoothing
profile_trinuc_merge["RELATIVE_MUTABILITY_PER_CHANNEL"] = profile_trinuc_merge[sample_name] / profile_trinuc_merge["COUNT"]
profile_trinuc_merge["RELATIVE_MUTABILITY_PER_CHANNEL"] = profile_trinuc_merge["RELATIVE_MUTABILITY_PER_CHANNEL"] / profile_trinuc_merge["RELATIVE_MUTABILITY_PER_CHANNEL"].sum()

profile_n_sample_trinuc = profile_trinuc_merge.merge(trinucleotide_counts.reset_index(), on = "CONTEXT", suffixes = ("", "_PANEL"))
profile_n_sample_trinuc["MUTS_PANEL"] = profile_n_sample_trinuc[sample_name] * profile_n_sample_trinuc[f"{sample_name}_PANEL"]
profile_n_sample_trinuc["MUTS_PANEL_FINAL"] = profile_n_sample_trinuc["MUTS_PANEL"] / profile_n_sample_trinuc["MUTS_PANEL"].sum() * total_mutations
smoothed_mutation_matrix_panel_counts = profile_n_sample_trinuc[["CONTEXT_MUT", "MUTS_PANEL_FINAL"]]

return smoothed_mutation_matrix_panel_counts.rename({"MUTS_PANEL_FINAL": sample_name}, axis = 1)
else:
print(f"No need to smooth the profile, the mutation count is already >= {minimum_mutations}")

profile_trinuc_clean_proportion = profile_trinuc_clean.copy()
profile_trinuc_clean_proportion[sample_name] = profile_trinuc_clean_proportion[sample_name] / profile_trinuc_clean_proportion[sample_name].sum()
profile_trinuc_clean_proportion.to_csv(f"{sample_name}.proportion_mutations.WGS.tsv",
header = True,
index = True,
sep = "\t")


# plot the profile as a percentage of SBS mutations seen after sequencing one WGS
# if mutations were occurring with the same probabilities as they occur in our sequenced panel
Expand All @@ -295,7 +325,7 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co
index = False,
sep = "\t")


return None



Expand All @@ -313,12 +343,14 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co
@click.option('--plot', is_flag=True, help='Generate plot and save as PDF')
@click.option('--wgs', is_flag=True, help='Store matrix of mutation counts at WGS level')
@click.option('--wgs_trinucleotide_counts', type=click.Path(exists=True), help='Trinucleotide counts file of the WGS (for profile mode if WGS active)')
@click.option('--smoothed', is_flag=True, help='Apply Bayesian smoothing to the mutation counts using a prior profile')
@click.option('--prior_profile', type=click.Path(exists=True), help='Prior profile file to use for Bayesian smoothing (required if --smoothed is set)')


@click.option('--sigprofiler', is_flag=True, help='Store the index column using the SigProfiler format')

def main(mode, sample_name, mut_file, out_matrix, method, pseud, sigprofiler, per_sample, mutation_matrix,
trinucleotide_counts, plot, wgs, wgs_trinucleotide_counts):
trinucleotide_counts, plot, wgs, wgs_trinucleotide_counts, smoothed, prior_profile):

if mode == 'matrix':
click.echo(f"Running in matrix mode...")
Expand All @@ -328,7 +360,11 @@ def main(mode, sample_name, mut_file, out_matrix, method, pseud, sigprofiler, pe

elif mode == 'profile':
click.echo(f"Running in profile mode...")
compute_mutation_profile(sample_name, mutation_matrix, trinucleotide_counts, plot, wgs, wgs_trinucleotide_counts, sigprofiler)
mutation_matrix_loaded = pd.read_csv(mutation_matrix, sep = "\t", header = 0)
smoothed_mutation_matrix = compute_mutation_profile(sample_name, mutation_matrix_loaded, trinucleotide_counts, plot, wgs, wgs_trinucleotide_counts, sigprofiler, smoothed, prior_profile)
if smoothed_mutation_matrix is not None:
compute_mutation_profile(sample_name, smoothed_mutation_matrix, trinucleotide_counts, plot, wgs, wgs_trinucleotide_counts, sigprofiler)

click.echo("Profile computation completed.")

else:
Expand Down
4 changes: 4 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,10 @@ process {
ext.args = "--level ${params.confidence_level}"
}

withName: COMPUTEPROFILE {
ext.smoothing = params.profile_smoothing
}

withName: MAF2VCF {
ext.args = "--output-dir . --maf-from-deepcsa --sample-name-column SAMPLE_ID"
publishDir = [
Expand Down
5 changes: 4 additions & 1 deletion modules/local/compute_profile/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ process COMPUTE_PROFILE {
label 'deepcsa_core'

input:
tuple val(meta), path(matrix), path(trinucleotide)
tuple val(meta) , path(matrix), path(trinucleotide)
path( wgs_trinucleotides )
tuple val(meta2), path( cohort_profile , stageAs: 'global_mutprofile.tsv')

output:
tuple val(meta), path("*.profile.tsv") , emit: profile
Expand All @@ -28,12 +29,14 @@ process COMPUTE_PROFILE {
def prefix = task.ext.prefix ?: ""
prefix = "${meta.id}${prefix}"
def wgs_trinuc = wgs_trinucleotides ? "--wgs --wgs_trinucleotide_counts ${wgs_trinucleotides}" : ""
def smoothing_args = task.ext.smoothing ? "--smoothed --prior_profile ${cohort_profile}" : ""
"""
mut_profile.py profile \\
--sample_name ${prefix} \\
--mutation_matrix ${matrix} \\
--trinucleotide_counts ${trinucleotide} \\
${wgs_trinuc} \\
${smoothing_args} \\
${args}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ params {
profilenonprot = false
profileexons = false
profileintrons = false
profile_smoothing = false
positive_selection_non_protein_affecting = false

oncodrivefml = false
Expand Down
5 changes: 5 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,11 @@
"type": "boolean",
"description": "Do you want to run the profile for intron mutations only?",
"fa_icon": "fas fa-book"
},
"profile_smoothing": {
"type": "boolean",
"description": "Do you want to apply smoothing to the mutational profile when having low numbers of mutations?",
"fa_icon": "fas fa-book"
}
}
},
Expand Down
15 changes: 14 additions & 1 deletion subworkflows/local/mutationprofile/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ include { COMPUTE_MATRIX as COMPUTEMATRIX } from '../../..
include { COMPUTE_TRINUCLEOTIDE as COMPUTETRINUC } from '../../../modules/local/compute_trinucleotide/main'

include { COMPUTE_PROFILE as COMPUTEPROFILE } from '../../../modules/local/compute_profile/main'
include { COMPUTE_PROFILE as COMPUTEPROFILECOHORT } from '../../../modules/local/compute_profile/main'
include { CONCAT_PROFILES as CONCATPROFILES } from '../../../modules/local/concatprofiles/main'


Expand Down Expand Up @@ -40,7 +41,19 @@ workflow MUTATIONAL_PROFILE {
.join(COMPUTETRINUC.out.trinucleotides)
.set{ matrix_n_trinucleotide }

COMPUTEPROFILE(matrix_n_trinucleotide, wgs_trinuc)
dummy_file = wgs_trinuc.map{ it -> [ [ id: "dummy_file" ], it ] }

if (params.profile_smoothing) {
matrix_n_trinucleotide
.join( channel.of([ [ id: "all_samples" ] ]) )
.set{ matrix_n_trinucleotide_all }

COMPUTEPROFILECOHORT(matrix_n_trinucleotide_all, wgs_trinuc, dummy_file)
cohort_profile = COMPUTEPROFILECOHORT.out.wgs_proportions.first()
} else {
cohort_profile = dummy_file
}
COMPUTEPROFILE(matrix_n_trinucleotide, wgs_trinuc, cohort_profile)

sigprofiler_empty = channel.of([])
sigprofiler_empty
Expand Down