Skip to content

Commit f90b5c8

Browse files
committed
add package scripts for pip install
1 parent 90aa926 commit f90b5c8

10 files changed

Lines changed: 2103 additions & 6 deletions

File tree

.gitignore

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
1-
data/hg19.fa
2-
data/checkpoint*
3-
**/__pycache__/
4-
**/.ipynb_checkpoints/
5-
cavs_test/
6-
output/
1+
*
2+
!data/motif-clustering-v2.1beta_consensus_pwms.meme
3+
!tpcav/
4+
!tpcav/*.py
5+
!scripts/
6+
!scripts/*.py
7+
!test/
8+
!test/*.py
9+
!pyproject.toml
10+
!LICENSE
11+
!README.md

pyproject.toml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
[build-system]
2+
requires = ["setuptools>=61.0"]
3+
build-backend = "setuptools.build_meta"
4+
5+
[project]
6+
name = "tpcav"
7+
version = "0.1.0"
8+
description = "Testing with PCA projected Concept Activation Vectors"
9+
authors = [{name = "Jianyu Yang", email = "yztxwd@gmail.com"},]
10+
readme = "README.md"
11+
requires-python = ">=3.8"
12+
dependencies = [
13+
"torch",
14+
"pandas",
15+
"numpy",
16+
"seqchromloader",
17+
"deeplift",
18+
"pyfaidx",
19+
"pybedtools",
20+
"captum",
21+
"scikit-learn",
22+
"biopython",
23+
"seaborn",
24+
"matplotlib",
25+
]
26+
license = "MIT AND (Apache-2.0 OR BSD-2-Clause)"
27+
keywords = ["interpretation", "attribution", "concept", "genomics", "deep learning"]
28+
29+
[project.urls]
30+
Homepage = "https://github.com/seqcode/TPCAV"
31+
32+
[tool.setuptools.packages.find]
33+
exclude = ["data", "model", "node_modules", "test", "scripts"]

test/test_cav_trainer.py

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
import unittest
2+
from pathlib import Path
3+
4+
import torch
5+
from Bio import motifs as Bio_motifs
6+
7+
from tpcav import helper
8+
from tpcav.cavs import CavTrainer
9+
from tpcav.concepts import ConceptBuilder
10+
from tpcav.tpcav_model import TPCAV
11+
12+
13+
class DummyModelSeq(torch.nn.Module):
14+
def __init__(self):
15+
super().__init__()
16+
self.layer1 = torch.nn.Linear(1024, 1)
17+
self.layer2 = torch.nn.Linear(4, 1)
18+
19+
def forward(self, seq):
20+
y_hat = self.layer1(seq)
21+
y_hat = y_hat.squeeze(-1)
22+
y_hat = self.layer2(y_hat)
23+
return y_hat
24+
25+
26+
class DummyModelSeqChrom(torch.nn.Module):
27+
def __init__(self):
28+
super().__init__()
29+
self.layer1 = torch.nn.Linear(1024, 1)
30+
self.layer2 = torch.nn.Linear(4, 1)
31+
32+
def forward(self, seq, chrom):
33+
y_hat = self.layer1(seq)
34+
y_hat = y_hat.squeeze(-1)
35+
y_hat = self.layer2(y_hat)
36+
return y_hat
37+
38+
39+
def transform_fasta_to_one_hot_seq(seq, chrom):
40+
return (helper.fasta_to_one_hot_sequences(seq),)
41+
42+
43+
class CavTrainerIntegrationTest(unittest.TestCase):
44+
45+
def test_motif_concepts(self):
46+
motif_path = Path("data") / "motif-clustering-v2.1beta_consensus_pwms.test.meme"
47+
self.assertTrue(motif_path.exists(), "Motif file is missing")
48+
49+
builder = ConceptBuilder(
50+
genome_fasta="data/hg38.analysisSet.fa",
51+
genome_size_file="data/hg38.analysisSet.fa.fai",
52+
input_window_length=1024,
53+
bws=None,
54+
num_motifs=16,
55+
include_reverse_complement=True,
56+
min_samples=1000,
57+
batch_size=8,
58+
)
59+
60+
builder.build_control()
61+
62+
builder.add_meme_motif_concepts(str(motif_path))
63+
64+
# load motifs
65+
motifs = Bio_motifs.parse(open(motif_path), fmt="minimal")
66+
67+
for motif in motifs:
68+
motif_name = motif.name.replace("/", "-")
69+
70+
concept = None
71+
for c in builder.concepts:
72+
if c.name == motif_name:
73+
concept = c
74+
break
75+
76+
self.assertIsNotNone(concept)
77+
78+
seq, chrom = next(iter(concept.data_iter))
79+
80+
matches = list(motif.pssm.search(seq[0], threshold=2.0))
81+
82+
self.assertGreaterEqual(
83+
len(matches),
84+
16,
85+
f"Motif concept {motif_name} has insufficient matches {matches}",
86+
)
87+
88+
control_seq, _ = next(iter(builder.control_concepts[0].data_iter))
89+
90+
control_matches = list(motif.pssm.search(control_seq[0], threshold=2.0))
91+
92+
self.assertGreater(
93+
len(matches),
94+
len(control_matches),
95+
f"Control concept has more motif matches than Motif concept, motif concept: {len(matches)}, control concept: {len(control_matches)}",
96+
)
97+
98+
def test_all(self):
99+
100+
motif_path = Path("data") / "motif-clustering-v2.1beta_consensus_pwms.test.meme"
101+
self.assertTrue(motif_path.exists(), "Motif file is missing")
102+
103+
builder = ConceptBuilder(
104+
genome_fasta="data/hg38.analysisSet.fa",
105+
genome_size_file="data/hg38.analysisSet.fa.fai",
106+
input_window_length=1024,
107+
bws=None,
108+
num_motifs=12,
109+
include_reverse_complement=True,
110+
min_samples=1000,
111+
batch_size=8,
112+
)
113+
114+
builder.build_control()
115+
116+
builder.add_meme_motif_concepts(str(motif_path))
117+
118+
builder.apply_transform(transform_fasta_to_one_hot_seq)
119+
120+
batch = next(iter(builder.all_concepts()[0].data_iter))
121+
122+
self.assertTupleEqual(batch[0].shape, (builder.batch_size, 4, 1024))
123+
124+
tpcav_model = TPCAV(DummyModelSeq(), layer_name="layer1")
125+
tpcav_model.fit_pca(
126+
concepts=builder.all_concepts(),
127+
num_samples_per_concept=10,
128+
num_pc="full",
129+
)
130+
torch.save(tpcav_model, "data/tmp_tpcav_model.pt")
131+
132+
cav_trainer = CavTrainer(tpcav_model, penalty="l2")
133+
cav_trainer.set_control(builder.control_concepts[0], num_samples=100)
134+
135+
cav_trainer.train_concepts(
136+
builder.concepts, 100, output_dir="data/cavs/", num_processes=2
137+
)
138+
torch.save(cav_trainer, "data/tmp_cav_trainer.pt")
139+
140+
random_regions_1 = helper.random_regions_dataframe(
141+
"data/hg38.analysisSet.fa.fai", 1024, 100, seed=1
142+
)
143+
random_regions_2 = helper.random_regions_dataframe(
144+
"data/hg38.analysisSet.fa.fai", 1024, 100, seed=2
145+
)
146+
147+
def pack_data_iters(df):
148+
seq_fasta_iter = helper.dataframe_to_fasta_iter(
149+
df, "data/hg38.analysisSet.fa", batch_size=8
150+
)
151+
seq_one_hot_iter = (
152+
helper.fasta_to_one_hot_sequences(seq_fasta)
153+
for seq_fasta in seq_fasta_iter
154+
)
155+
chrom_iter = helper.dataframe_to_chrom_tracks_iter(df, None, batch_size=8)
156+
return zip(
157+
seq_one_hot_iter,
158+
)
159+
160+
attributions = tpcav_model.layer_attributions(
161+
pack_data_iters(random_regions_1), pack_data_iters(random_regions_2)
162+
)["attributions"]
163+
164+
cav_trainer.tpcav_score("AC0001:GATA-PROP:GATA", attributions)
165+
166+
cav_trainer.plot_cavs_similaritiy_heatmap(attributions)
167+
168+
input_attrs = tpcav_model.input_attributions(
169+
pack_data_iters(random_regions_1),
170+
pack_data_iters(random_regions_2),
171+
multiply_by_inputs=True,
172+
cavs_list=[
173+
cav_trainer.cav_weights["AC0001:GATA-PROP:GATA"],
174+
],
175+
)
176+
177+
178+
if __name__ == "__main__":
179+
unittest.main()

tpcav/__init__.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""
2+
Lightweight, reusable TCAV utilities built from the repository scripts.
3+
4+
The package keeps existing scripts untouched while offering programmatic
5+
access to concept construction and PCA/attribution workflows.
6+
"""
7+
8+
import logging
9+
10+
# Set the logging level to INFO
11+
logging.basicConfig(level=logging.INFO)
12+
13+
from .cavs import CavTrainer
14+
from .concepts import ConceptBuilder
15+
from .helper import (
16+
bed_to_chrom_tracks_iter,
17+
bed_to_fasta_iter,
18+
dataframe_to_chrom_tracks_iter,
19+
dataframe_to_fasta_iter,
20+
dinuc_shuffle_sequences,
21+
fasta_to_one_hot_sequences,
22+
random_regions_dataframe,
23+
)
24+
from .logging_utils import set_verbose
25+
from .tpcav_model import TPCAV
26+
27+
__all__ = [
28+
"ConceptBuilder",
29+
"CavTrainer",
30+
"TPCAV",
31+
"bed_to_fasta_iter",
32+
"dataframe_to_fasta_iter",
33+
"bed_to_chrom_tracks_iter",
34+
"dataframe_to_chrom_tracks_iter",
35+
"fasta_to_one_hot_sequences",
36+
"random_regions_dataframe",
37+
"dinuc_shuffle_sequences",
38+
"set_verbose",
39+
]

0 commit comments

Comments
 (0)