Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
292 changes: 115 additions & 177 deletions bin/annotate_omega_failing.py

Large diffs are not rendered by default.

72 changes: 57 additions & 15 deletions bin/check_contamination.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,20 +41,7 @@ def compute_shared_variants(somatic_variants, germline_variants):



def contamination_detection(maf_file, somatic_maf_file):

maf_df = pd.read_table(maf_file, na_values=custom_na_values)
print(maf_df.shape)
maf_df = maf_df[~(maf_df["FILTER.not_covered"])
& (maf_df["TYPE"] == 'SNV')
].reset_index()
print(maf_df.shape)

somatic_maf_df = pd.read_table(somatic_maf_file, na_values=custom_na_values)
print(somatic_maf_df.shape)
somatic_maf_df = somatic_maf_df[(somatic_maf_df["TYPE"] == 'SNV')]
print(somatic_maf_df.shape)

def contamination_detection_between_samples(maf_df, somatic_maf_df):

# this is if we were to consider both unique and no-unique variants
vaf_threshold = 0.2
Expand Down Expand Up @@ -372,14 +359,69 @@ def contamination_detection(maf_file, somatic_maf_file):



def data_loading(maf_path, somatic_maf_path):
    """Load the full MAF and the somatic MAF, keeping only usable SNVs.

    The full MAF is restricted to covered positions (FILTER.not_covered is
    falsy) of TYPE 'SNV'; the somatic MAF is restricted to TYPE 'SNV' only.
    Shapes before and after filtering are printed for quick sanity checks.

    Args:
        maf_path: path to the full MAF table (TSV).
        somatic_maf_path: path to the filtered somatic mutations table (TSV).

    Returns:
        Tuple of (filtered full MAF DataFrame, filtered somatic MAF DataFrame).
    """
    maf_df = pd.read_table(maf_path, na_values=custom_na_values)
    print(maf_df.shape)
    covered = ~maf_df["FILTER.not_covered"]
    is_snv = maf_df["TYPE"] == 'SNV'
    maf_df = maf_df[covered & is_snv].reset_index()
    print(maf_df.shape)

    somatic_maf_df = pd.read_table(somatic_maf_path, na_values=custom_na_values)
    print(somatic_maf_df.shape)
    somatic_maf_df = somatic_maf_df[somatic_maf_df["TYPE"] == 'SNV']
    print(somatic_maf_df.shape)

    return maf_df, somatic_maf_df


def contamination_detection_in_snps(maf):
    """Screen gnomAD SNP positions for low-VAF calls suggestive of contamination.

    Calls flagged as gnomAD SNPs are split by VAF: below 0.05 they are treated
    as potentially contaminant ("somatic-like"), at or above 0.05 as genuine
    germline. Per sample, the proportion of somatic-like SNP calls over the
    SNP positions not already explained by germline in that sample is written
    to sample_SNP_mutation_freq.tsv, most suspicious samples first.

    Args:
        maf: DataFrame with at least SAMPLE_ID, MUT_ID, VAF and the boolean
            FILTER.gnomAD_SNP columns.
    """
    snp_calls = maf[maf["FILTER.gnomAD_SNP"]][
                                ["SAMPLE_ID", "MUT_ID", "VAF"]
                            ].reset_index(drop=True)

    # very restrictive VAF cut-off to count occurrences of potentially
    # contaminated mutations: <0.05 somatic-like, >=0.05 germline-like
    low_vaf_calls = snp_calls[snp_calls["VAF"] < 0.05].reset_index(drop=True)
    high_vaf_calls = snp_calls[snp_calls["VAF"] >= 0.05].reset_index(drop=True)

    n_unique_snp_positions = len(snp_calls["MUT_ID"].unique())

    # per-sample counts, computed once instead of filtering inside the loop
    germline_per_sample = high_vaf_calls["SAMPLE_ID"].value_counts()
    somatic_per_sample = low_vaf_calls["SAMPLE_ID"].value_counts()

    rows = []
    for sample in snp_calls["SAMPLE_ID"].unique():
        germline_count = int(germline_per_sample.get(sample, 0))
        somatic_count = int(somatic_per_sample.get(sample, 0))
        remaining_germline = n_unique_snp_positions - germline_count
        # if every SNP position is already germline in this sample there is
        # nothing left to contaminate: report the maximal proportion (1)
        proportion = somatic_count / remaining_germline if remaining_germline > 0 else 1
        rows.append([sample, germline_count, remaining_germline, somatic_count, proportion])

    freq_df = pd.DataFrame(
        rows,
        columns=["SAMPLE_ID", "germline_count", "remaining_germline",
                 "somatic_count", "prop_somatic_SNPs"],
    )

    # rank samples so candidate contaminated ones (highest proportion) come first
    freq_df = freq_df.sort_values(by="prop_somatic_SNPs", ascending=False)
    freq_df.to_csv("sample_SNP_mutation_freq.tsv", header=True, sep='\t', index=False)



@click.command()
@click.option('--maf_path', type=click.Path(exists=True), required=True, help='Path to the MAF file.')
@click.option('--somatic_maf', type=click.Path(exists=True), required=True, help='Path to the filtered somatic mutations file.')
def main(maf_path, somatic_maf):
    """
    CLI entry point for assessing contamination between samples using germline and somatic mutations.

    Loads and SNV-filters both MAF tables once, then runs the two analyses
    on the shared DataFrames. (The stale call to the removed
    contamination_detection() function was dropped: that function was renamed
    to contamination_detection_between_samples and now takes the pre-loaded
    DataFrames, so the old call would raise NameError at runtime.)
    """
    maf_df, somatic_maf_df = data_loading(maf_path, somatic_maf)

    print("Running contamination analysis between samples")
    contamination_detection_between_samples(maf_df, somatic_maf_df)

    print("Running general contamination analysis")
    contamination_detection_in_snps(maf_df)



Expand Down
76 changes: 56 additions & 20 deletions bin/mut_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,17 @@
from utils_plot import plot_profile
from read_utils import custom_na_values

def bayesian_update(observed_counts, prior_profile, limit=200):
    """Top up sparse observed mutation counts with prior-derived pseudo-counts.

    If the observed counts already sum to at least ``limit`` they are returned
    untouched. Otherwise ``limit - N`` pseudo-counts are distributed across
    channels according to ``prior_profile`` and added to the observations.

    Args:
        observed_counts: per-channel mutation counts (array-like / DataFrame).
        prior_profile: DataFrame with a CONTEXT_MUT column and one column of
            per-channel prior values — presumably probabilities summing to 1,
            so the added mass totals ``limit - N``; confirm against the caller.
        limit: minimum total count below which smoothing is applied.

    Returns:
        The original ``observed_counts`` when N >= limit, otherwise the
        smoothed per-channel matrix.
    """
    total_observed = np.sum(observed_counts).item()

    if total_observed >= limit:
        return observed_counts

    indexed_prior = prior_profile.set_index("CONTEXT_MUT")
    pseudo_counts = (limit - total_observed) * indexed_prior
    return pseudo_counts.values + observed_counts


def compute_mutation_matrix(sample_name, mutations_file, mutation_matrix, method, pseudocount,
sigprofiler, per_sample):
Expand Down Expand Up @@ -98,7 +109,6 @@ def compute_mutation_matrix(sample_name, mutations_file, mutation_matrix, method
sep = "\t")



def profile_stability(counts, denominator):
"""
Compute stability score of a probability profile by
Expand Down Expand Up @@ -164,23 +174,28 @@ def profile_stability(counts, denominator):
}


def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_counts_file, plot,
wgs = False, wgs_trinucleotide_counts = False, sigprofiler = False):
def compute_mutation_profile(sample_name, mutation_matrix, trinucleotide_counts_file, plot,
wgs = False, wgs_trinucleotide_counts = False, sigprofiler = False,
smoothed = False, prior_profile_file = None,
minimum_mutations = 200
):
"""
Compute mutational profile from the input data

Required information:
Mutation matrix
Trinucleotide content of the sequenced region (depth-aware or non-depth-aware)

Minimum number of mutations to apply Bayesian smoothing (if smoothed is active)
This has been set to 200 after proper testing of the minimum required entropy of mutational profiles.
Output:
Mutational profile per sample
"""

# Load your mutation matrix
mutation_matrix = pd.read_csv(mutation_matrix_file, sep = "\t", header = 0)
mutation_matrix = mutation_matrix.set_index("CONTEXT_MUT")
total_mutations = np.sum(mutation_matrix[sample_name])

# proportion of SBS mutations per trinucleotide in panel
mutation_matrix_proportions = mutation_matrix.copy()
mutation_matrix_proportions[sample_name] = mutation_matrix_proportions[sample_name] / total_mutations
Expand Down Expand Up @@ -214,17 +229,6 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co
# normalize
mut_probability = mut_probability / mut_probability.sum()

# if there is any channel with 0 probability we need to add a pseudocount
if not all(mut_probability[sample_name].values > 0):
# find the minimum value greater than 0
min_value_non_zero = mut_probability[mut_probability > 0].min()
# print(min_value_non_zero)

# add a dynamic pseudocount of one third of the minimum number greater than 0
mut_probability = mut_probability + (min_value_non_zero / 3)

mut_probability = mut_probability / mut_probability.sum()


# reindex to ensure the right order
mut_probability = mut_probability.reindex(contexts_formatted)
Expand Down Expand Up @@ -271,14 +275,40 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co
header = True,
index = True,
sep = "\t")

if total_mutations < minimum_mutations and smoothed:
print(f"Using a prior profile to smooth the profile, since the mutation count is < {minimum_mutations}")
prior_profile = pd.read_table(prior_profile_file)

upd_mutation_matrix_wgs = bayesian_update(profile_trinuc_clean, prior_profile, minimum_mutations).reset_index()
total_mutations = max(minimum_mutations, total_mutations)

# we have updated the mutation counts at the WGS level, we should now revert this update
# for the specific sample's trinucleotide content and depth

upd_mutation_matrix_wgs["CONTEXT"] = upd_mutation_matrix_wgs["CONTEXT_MUT"].apply( lambda x : x[:3])
profile_trinuc_merge = upd_mutation_matrix_wgs.merge(ref_trinuc32, on = "CONTEXT")

# relative mutability per trinucleotide change in the panel after smoothing
profile_trinuc_merge["RELATIVE_MUTABILITY_PER_CHANNEL"] = profile_trinuc_merge[sample_name] / profile_trinuc_merge["COUNT"]
profile_trinuc_merge["RELATIVE_MUTABILITY_PER_CHANNEL"] = profile_trinuc_merge["RELATIVE_MUTABILITY_PER_CHANNEL"] / profile_trinuc_merge["RELATIVE_MUTABILITY_PER_CHANNEL"].sum()

profile_n_sample_trinuc = profile_trinuc_merge.merge(trinucleotide_counts.reset_index(), on = "CONTEXT", suffixes = ("", "_PANEL"))
profile_n_sample_trinuc["MUTS_PANEL"] = profile_n_sample_trinuc[sample_name] * profile_n_sample_trinuc[f"{sample_name}_PANEL"]
profile_n_sample_trinuc["MUTS_PANEL_FINAL"] = profile_n_sample_trinuc["MUTS_PANEL"] / profile_n_sample_trinuc["MUTS_PANEL"].sum() * total_mutations
smoothed_mutation_matrix_panel_counts = profile_n_sample_trinuc[["CONTEXT_MUT", "MUTS_PANEL_FINAL"]]

return smoothed_mutation_matrix_panel_counts.rename({"MUTS_PANEL_FINAL": sample_name}, axis = 1)
else:
print(f"No need to smooth the profile, the mutation count is already >= {minimum_mutations}")

profile_trinuc_clean_proportion = profile_trinuc_clean.copy()
profile_trinuc_clean_proportion[sample_name] = profile_trinuc_clean_proportion[sample_name] / profile_trinuc_clean_proportion[sample_name].sum()
profile_trinuc_clean_proportion.to_csv(f"{sample_name}.proportion_mutations.WGS.tsv",
header = True,
index = True,
sep = "\t")


# plot the profile as a percentage of SBS mutations seen after sequencing one WGS
# if mutations were occurring with the same probabilities as they occur in our sequenced panel
Expand All @@ -295,7 +325,7 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co
index = False,
sep = "\t")


return None



Expand All @@ -313,12 +343,14 @@ def compute_mutation_profile(sample_name, mutation_matrix_file, trinucleotide_co
@click.option('--plot', is_flag=True, help='Generate plot and save as PDF')
@click.option('--wgs', is_flag=True, help='Store matrix of mutation counts at WGS level')
@click.option('--wgs_trinucleotide_counts', type=click.Path(exists=True), help='Trinucleotide counts file of the WGS (for profile mode if WGS active)')
@click.option('--smoothed', is_flag=True, help='Apply Bayesian smoothing to the mutation counts using a prior profile')
@click.option('--prior_profile', type=click.Path(exists=True), help='Prior profile file to use for Bayesian smoothing (required if --smoothed is set)')


@click.option('--sigprofiler', is_flag=True, help='Store the index column using the SigProfiler format')

def main(mode, sample_name, mut_file, out_matrix, method, pseud, sigprofiler, per_sample, mutation_matrix,
trinucleotide_counts, plot, wgs, wgs_trinucleotide_counts):
trinucleotide_counts, plot, wgs, wgs_trinucleotide_counts, smoothed, prior_profile):

if mode == 'matrix':
click.echo(f"Running in matrix mode...")
Expand All @@ -328,7 +360,11 @@ def main(mode, sample_name, mut_file, out_matrix, method, pseud, sigprofiler, pe

elif mode == 'profile':
click.echo(f"Running in profile mode...")
compute_mutation_profile(sample_name, mutation_matrix, trinucleotide_counts, plot, wgs, wgs_trinucleotide_counts, sigprofiler)
mutation_matrix_loaded = pd.read_csv(mutation_matrix, sep = "\t", header = 0)
smoothed_mutation_matrix = compute_mutation_profile(sample_name, mutation_matrix_loaded, trinucleotide_counts, plot, wgs, wgs_trinucleotide_counts, sigprofiler, smoothed, prior_profile)
if smoothed_mutation_matrix is not None:
compute_mutation_profile(sample_name, smoothed_mutation_matrix, trinucleotide_counts, plot, wgs, wgs_trinucleotide_counts, sigprofiler)

click.echo("Profile computation completed.")

else:
Expand Down
4 changes: 4 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,10 @@ process {
ext.args = "--level ${params.confidence_level}"
}

withName: COMPUTEPROFILE {
ext.smoothing = params.profile_smoothing
}

withName: MAF2VCF {
ext.args = "--output-dir . --maf-from-deepcsa --sample-name-column SAMPLE_ID"
publishDir = [
Expand Down
5 changes: 4 additions & 1 deletion modules/local/compute_profile/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ process COMPUTE_PROFILE {
label 'deepcsa_core'

input:
tuple val(meta), path(matrix), path(trinucleotide)
tuple val(meta) , path(matrix), path(trinucleotide)
path( wgs_trinucleotides )
tuple val(meta2), path( cohort_profile , stageAs: 'global_mutprofile.tsv')

output:
tuple val(meta), path("*.profile.tsv") , emit: profile
Expand All @@ -28,12 +29,14 @@ process COMPUTE_PROFILE {
def prefix = task.ext.prefix ?: ""
prefix = "${meta.id}${prefix}"
def wgs_trinuc = wgs_trinucleotides ? "--wgs --wgs_trinucleotide_counts ${wgs_trinucleotides}" : ""
def smoothing_args = task.ext.smoothing ? "--smoothed --prior_profile ${cohort_profile}" : ""
"""
mut_profile.py profile \\
--sample_name ${prefix} \\
--mutation_matrix ${matrix} \\
--trinucleotide_counts ${trinucleotide} \\
${wgs_trinuc} \\
${smoothing_args} \\
${args}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ params {
profilenonprot = false
profileexons = false
profileintrons = false
profile_smoothing = false
positive_selection_non_protein_affecting = false

oncodrivefml = false
Expand Down
5 changes: 5 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,11 @@
"type": "boolean",
"description": "Do you want to run the profile for intron mutations only?",
"fa_icon": "fas fa-book"
},
"profile_smoothing": {
"type": "boolean",
"description": "Do you want to apply smoothing to the mutational profile when having low numbers of mutations?",
"fa_icon": "fas fa-book"
}
}
},
Expand Down
15 changes: 14 additions & 1 deletion subworkflows/local/mutationprofile/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ include { COMPUTE_MATRIX as COMPUTEMATRIX } from '../../..
include { COMPUTE_TRINUCLEOTIDE as COMPUTETRINUC } from '../../../modules/local/compute_trinucleotide/main'

include { COMPUTE_PROFILE as COMPUTEPROFILE } from '../../../modules/local/compute_profile/main'
include { COMPUTE_PROFILE as COMPUTEPROFILECOHORT } from '../../../modules/local/compute_profile/main'
include { CONCAT_PROFILES as CONCATPROFILES } from '../../../modules/local/concatprofiles/main'


Expand Down Expand Up @@ -40,7 +41,19 @@ workflow MUTATIONAL_PROFILE {
.join(COMPUTETRINUC.out.trinucleotides)
.set{ matrix_n_trinucleotide }

COMPUTEPROFILE(matrix_n_trinucleotide, wgs_trinuc)
dummy_file = wgs_trinuc.map{ it -> [ [ id: "dummy_file" ], it ] }

if (params.profile_smoothing) {
matrix_n_trinucleotide
.join( channel.of([ [ id: "all_samples" ] ]) )
.set{ matrix_n_trinucleotide_all }

COMPUTEPROFILECOHORT(matrix_n_trinucleotide_all, wgs_trinuc, dummy_file)
cohort_profile = COMPUTEPROFILECOHORT.out.wgs_proportions.first()
} else {
cohort_profile = dummy_file
}
COMPUTEPROFILE(matrix_n_trinucleotide, wgs_trinuc, cohort_profile)

sigprofiler_empty = channel.of([])
sigprofiler_empty
Expand Down