Skip to content

Commit 6fe16dc

Browse files
committed
Sort reads with variants
1 parent f73fa0f commit 6fe16dc

1 file changed

Lines changed: 51 additions & 9 deletions

File tree

src/smvplot/visualize.py

Lines changed: 51 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ def get_args():
7777
default="./", help='subfolder for the plots')
7878
argument_parser.add_argument('--out_format', metavar='STR', type=str,
7979
default='pdf', required=False, help='Output format of the plot, [default = %(default)s]')
80+
argument_parser.add_argument('--ref_base', metavar='STR', type=str,
81+
default='', required=False, help='Reference base for the variant entry, [default = %(default)s]')
82+
argument_parser.add_argument('--alt_base', metavar='STR', type=str,
83+
default='', required=False, help='Alternate base for the variant entry, [default = %(default)s]')
8084

8185

8286
parsed_arguments = argument_parser.parse_args()
@@ -286,7 +290,7 @@ def parse_cigar( cigar, pos, r_left, r_right):
286290

287291
# Iterate over the cigar structure
288292
for n,t in cigar_struct:
289-
293+
290294
# If the cigar is a match or a soft clipped, then add the position to the list
291295
if t in ['M','S']:
292296

@@ -315,7 +319,7 @@ def parse_cigar( cigar, pos, r_left, r_right):
315319

316320
rel_pos += n
317321

318-
elif t == "N":
322+
elif t == "N":
319323
for m in range(n):
320324
if r_left <= abs_pos+m <= r_right:
321325
cigar_struct2.append((t, abs_pos+m, rel_pos))
@@ -390,14 +394,52 @@ def plot_histogram( cigars, ax, ax_id):
390394

391395

392396

393-
def plot_cigars( cigars, sequences, reverses, ax, reference_function ):
394-
397+
def plot_cigars(cigars, sequences, reverses, ax, reference_function, region_center, region_chrom):
395398
patches = []
396399

397400
right_limits = [0] * len(cigars)
398401
basepair_colors = { 'A':"#009600", 'C':"#3030fe", 'G':"#d17105", 'T':"#ff0000", 'N':"#00ffff" }
399402

400-
for cigar,sequence,reverse in zip(cigars,sequences,reverses):
403+
# Sort cigars based on non-reference variants at the region center
404+
# The cigar list has the following structure: [ (cigar_type, absolute_position, relative_position), ... ]
405+
# The cigar_type can be 'M' (match), 'I' (insertion), 'D' (deletion), 'S' (soft clip), 'N' (skipped region)
406+
# The absolute_position is the position in the reference genome
407+
# The relative_position is the position in the read sequence
408+
# The cigar list is sorted based on the non-reference variants at the region center
409+
# The region center is the position of the variant in the reference genome
410+
# The reference_function is the reference genome sequence
411+
# Now separated reads based on the non-reference variants at the region center to group similar reads together
412+
# Create two lists: one for the reads that have the non-reference variant at the region center and one for the reads that do not have the non-reference variant at the region center
413+
414+
cigar_with_variants = []
415+
cigar_without_variants = []
416+
seq_order_with_variants = []
417+
seq_order_without_variants = []
418+
419+
for i, (read_cigar, sequence) in enumerate(zip(cigars, sequences)):
420+
# Check if the region center is in the read
421+
for position_cigar in read_cigar:
422+
c_type, abs_pos, rel_pos = position_cigar
423+
if abs_pos == region_center:
424+
if (c_type == 'M' and sequence[rel_pos] != reference_function[abs_pos]) and sequence[rel_pos] == parsed_arguments.alt_base:
425+
cigar_with_variants.append(read_cigar)
426+
seq_order_with_variants.append(i)
427+
break
428+
elif abs_pos == (region_center + 1) and (len(parsed_arguments.alt_base) > 1 or len(parsed_arguments.ref_base) > 1):
429+
if c_type == 'I' or c_type == 'D':
430+
cigar_with_variants.append(read_cigar)
431+
seq_order_with_variants.append(i)
432+
break
433+
else:
434+
cigar_without_variants.append(read_cigar)
435+
seq_order_without_variants.append(i)
436+
437+
438+
# Sort the reads with the non-reference variant at the region center based on the non-reference variant
439+
cigars_sorted = cigar_with_variants + cigar_without_variants
440+
sequences_sorted = [sequences[i] for i in seq_order_with_variants] + [sequences[i] for i in seq_order_without_variants]
441+
442+
for cigar, sequence, reverse in zip(cigars_sorted, sequences_sorted, reverses):
401443

402444
for j in range(len(cigars)):
403445
if right_limits[j] <= cigar[0][1]:
@@ -531,7 +573,7 @@ def plot_region(region_chrom, region_center, region_left, region_right, plot_tit
531573
plot_cigars([parse_cigar(read[5], int(read[3]), region_left, region_right) for read in samtools_read_plot if read[5] != "*"],
532574
[read[9] for read in samtools_read_plot if read[5] != "*"],
533575
[bool(int(read[1]) & 0x10) for read in samtools_read_plot if read[5] != "*"],
534-
ax[idx * 2 + 1], reference_buffer)
576+
ax[idx * 2 + 1], reference_buffer, region_center, region_chrom)
535577

536578
plot_title_vaf = plot_title
537579
if parsed_arguments.vaf:
@@ -623,9 +665,9 @@ def main():
623665
region_left = region_center - parsed_arguments.window
624666
region_right = region_center + parsed_arguments.window
625667

626-
plot_title = "%s:%s" % ( region_chrom, region_center )
668+
plot_title = "%s:%s-%s>%s" % ( region_chrom, region_center, parsed_arguments.ref_base, parsed_arguments.alt_base )
627669

628-
print(region_chrom, region_left, region_center, region_right)
670+
#print(region_chrom, region_left, region_center, region_right)
629671

630672
plot_region( region_chrom, region_center, region_left, region_right, plot_title )
631673

@@ -670,7 +712,7 @@ def main():
670712
line.split('\t')[ vcf_columns["Tumor_VAF"] ],
671713
line.split('\t')[ vcf_columns["RNA_VARIANT_AF"] ],
672714
]
673-
715+
674716
plot_region( region_chrom, region_center, region_left, region_right, plot_title, VAFs)
675717
plot_output_file = os.path.join(parsed_arguments.plot_dir, ("%s_%s_%i.%s" %(parsed_arguments.prefix, region_chrom, region_center, parsed_arguments.out_format)))
676718
plot.savefig( plot_output_file )

0 commit comments

Comments
 (0)