Add option to include reverse-complement (rc) versions of motifs

yztxwd · yztxwd · commit b582606220b3 · 2025-10-20T14:15:31.000-04:00
diff --git a/scripts/run_tcav_sgd_pca.py b/scripts/run_tcav_sgd_pca.py
@@ -182,9 +182,9 @@ def make_predictions_and_save(avdl, name):
 def construct_motif_concept_dataloader_from_control(
     control_seq_bed_df,
     genome_fasta,
-    motifs:list,
+    motifs: list,
     num_motifs=128,
-    motif_mode='pwm',
+    motif_mode="pwm",
     start_buffer=0,
     end_buffer=0,
     batch_size=8,
@@ -343,6 +343,11 @@ def pair(arg):
     parser.add_argument(
         "--num-motifs", type=int, default=12, help="Number of motifs to insert"
     )
+    parser.add_argument(
+        "--include-reverse-complement",
+        action="store_true",
+        help="Use both forward and reverse complement version of motifs",
+    )
     parser.add_argument(
         "--num-samples-per-concept",
         type=int,
@@ -422,21 +427,29 @@ def pair(arg):
     concepts = []
     ## custom motifs, use the first control concept as a template
     if args.custom_motifs is not None:
-        df = pd.read_table(args.custom_motifs, names=['motif_name', 'consensus_seq'])
+        df = pd.read_table(args.custom_motifs, names=["motif_name", "consensus_seq"])
         for m in np.unique(df.motif_name):
             motif_name = m
-            consensus_seqs = df.loc[df.motif_name==m, 'consensus_seq'].tolist() # take all consensus seqs that correspond to the same motif name
+            consensus_seqs = df.loc[
+                df.motif_name == m, "consensus_seq"
+            ].tolist()  # take all consensus seqs that correspond to the same motif name
             motifs = []
             for i, c in enumerate(consensus_seqs):
                 motif = utils.CustomMotif(f"{m}_{i}", c)
                 motifs.append(motif)
+                if (
+                    args.include_reverse_complement
+                ):  # add reverse complement if specified
+                    motif_rc = motif.reverse_complement()
+                    motifs.append(motif_rc)
+
             cn = f"{motif_name}"
             seq_dl = construct_motif_concept_dataloader_from_control(
                 random_regions_df,
                 args.genome_fasta_file,
                 motifs=motifs,
                 num_motifs=args.num_motifs,
-                motif_mode='consensus',
+                motif_mode="consensus",
                 batch_size=BATCH_SIZE,
                 infinite=False,
             )
@@ -451,13 +464,18 @@ def pair(arg):
     if args.meme_motifs is not None:
         with open(args.meme_motifs) as f:
             for motif in Bio_motifs.parse(f, fmt="MINIMAL"):
+                motifs = []
+                motifs.append(motif)
+                if args.include_reverse_complement:
+                    motif_rc = motif.reverse_complement()
+                    motifs.append(motif_rc)
                 cn = f"{motif.name.replace('/', '-')}"
                 seq_dl = construct_motif_concept_dataloader_from_control(
                     random_regions_df,
                     args.genome_fasta_file,
-                    motifs=[motif],
+                    motifs=motifs,
                     num_motifs=args.num_motifs,
-                    motif_mode='pwm',
+                    motif_mode="pwm",
                     batch_size=BATCH_SIZE,
                     infinite=False,
                 )
@@ -528,7 +546,7 @@ def pair(arg):
     logger.info(concepts)
 
     # register hook
-    def get_activation(concept, num_samples=10):
+    def get_activation(concept, num_samples=args.num_samples_per_concept):
         avs = []
         num = 0
         for seq, chrom in concept.data_iter:
@@ -596,8 +614,9 @@ def get_activation(concept, num_samples=10):
     # set to eval mode for sanity
     model.eval()
 
-    def get_tpcav_activations(concept):
+    def get_tpcav_activations(concept, num_samples=args.num_samples):
         avs_pca = []
+        num = 0
         for seq, chrom in concept.data_iter:
             seq = utils.seq_transform_fn(seq)
             chrom = utils.chrom_transform_fn(chrom)
@@ -613,9 +632,13 @@ def get_tpcav_activations(concept):
             else:
                 av_pca = av_residual
             avs_pca.append(av_pca.detach().cpu())
+
+            num += av_pca.shape[0]
+            if num >= num_samples:
+                break
             with torch.no_grad():
                 del seq, av, av_projected, av_residual
-        return torch.cat(avs_pca).detach().cpu()
+        return torch.cat(avs_pca).detach().cpu()[:num_samples]
 
     # get activations of each concept and train classifier for each pair
     pool = Pool()
diff --git a/scripts/utils.py b/scripts/utils.py
@@ -6,14 +6,14 @@
 
 from itertools import cycle
 
+import Bio
 import numpy as np
 import pandas as pd
 import pyfaidx
 import seqchromloader as scl
 import torch
 from Bio import SeqIO
 from deeplift.dinuc_shuffle import dinuc_shuffle
-from models import ConvTowerDomain_v6
 from pybedtools import BedTool
 from pyfaidx import Fasta
 from seq_utils import insert_motif_into_seq, insert_region_into_seq
@@ -289,7 +289,10 @@ def center_windows(df, window_len=1024):
     df = df.assign(mid=lambda x: ((x["start"] + x["end"]) / 2).astype(int)).assign(
         start=lambda x: x["mid"] - halfR, end=lambda x: x["mid"] + halfR
     )
-    return df[["chrom", "start", "end"]]
+    if "strand" in df.columns:
+        return df[["chrom", "start", "end", "strand"]]
+    else:
+        return df[["chrom", "start", "end"]]
 
 
 def collate_seq(batch):
@@ -363,7 +366,8 @@ def seq_dataloader_from_dataframe(
 ):
     seq_df = center_windows(seq_df, window_len=window_len)
     seq_df["label"] = -1
-    seq_df["strand"] = "+"
+    if not "strand" in seq_df.columns:
+        seq_df["strand"] = "+"
     # print(f"Filtering out concept samples that don't exist in the genome...")
     seq_df = scl.filter_chromosomes(seq_df, to_keep=Fasta(genome_fasta).keys())
     dl = scl.SeqChromDatasetByDataFrame(
@@ -421,7 +425,8 @@ def chrom_dataloader_from_dataframe(
 ):
     chrom_df = center_windows(chrom_df, window_len=input_window_length)
     chrom_df["label"] = -1
-    chrom_df["strand"] = "+"
+    if not "strand" in chrom_df.columns:
+        chrom_df["strand"] = "+"
     # print(f"Filtering out concept samples that don't exist in the genome...")
     chrom_df = scl.filter_chromosomes(chrom_df, to_keep=Fasta(genome_fasta).keys())
     dl = scl.SeqChromDatasetByDataFrame(
@@ -475,7 +480,8 @@ def seq_dataloader(self):
             )
             seq_df = center_windows(seq_df, window_len=self.window_len)
             seq_df["label"] = -1
-            seq_df["strand"] = "+"
+            if not "strand" in seq_df.columns:
+                seq_df["strand"] = "+"
             # print(f"Filtering out concept samples that don't exist in the genome...")
             seq_df = scl.filter_chromosomes(
                 seq_df, to_keep=Fasta(self.genome_fasta).keys()
@@ -515,7 +521,8 @@ def chrom_dataloader(self):
         )
         chrom_df = center_windows(chrom_df, window_len=self.window_len)
         chrom_df["label"] = -1
-        chrom_df["strand"] = "+"
+        if not "strand" in chrom_df.columns:
+            chrom_df["strand"] = "+"
         # print(f"Filtering out concept samples that don't exist in the genome...")
         chrom_df = scl.filter_chromosomes(
             chrom_df, to_keep=Fasta(self.genome_fasta).keys()
@@ -610,4 +617,15 @@ def __len__(self):
 
     def reverse_complement(self):
-        self.consensus = Bio.Seq.reverse_complement(self.consensus)
-        return self
+        """Return a new motif holding the reverse complement of this one.
+
+        The receiver is left untouched, so a caller can keep both the forward
+        motif and the value returned here in the same list. The returned
+        motif's name is this motif's name with "_rc" appended.
+        """
+        import copy  # local import: only this method needs it
+
+        # Shallow copy keeps any other attributes of the instance as they are.
+        rc = copy.copy(self)
+        rc.consensus = Bio.Seq.reverse_complement(self.consensus)
+        rc.name = self.name + "_rc"
+        return rc