Skip to content

Commit f2873a9

Browse files
committed
initial commit HIV_samples
This commit includes the initial READMEs to outline the directory structure of this directory along with the HIV genome used (saved to input) and the updated setup.sh script for creating a new hg19-based EpitopeID database that includes the HIV genome as an "epitope"/"FASTA_Tag".
1 parent 4fe86c3 commit f2873a9

6 files changed

Lines changed: 252 additions & 5 deletions

File tree

paper/HIV_samples/README

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Run EpitopeID on HIV datasets to evaluate EpitopeID's ability to detect HIV genome insertions
2+
3+
# "Benzotriazoles Reactivate Latent HIV-1 through Inactivation of STAT5 SUMOylation"
4+
# (Bosque et al, 2017)
5+
6+
# GEO accession: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE84199
7+
# HIV genome: https://www.ncbi.nlm.nih.gov/nuccore/AF324493
8+
9+
# EpitopeID database with HIV genome as a tag and hg19 genome as genomic sequence is setup with `../setup.sh`
10+
# Download data using SRA accessions using `job/00_download_data.pbs`
11+
# Run EpitopeID on FASTQ inputs using `job/01_run_EpitopeID.pbs` to determine if EpitopeID can localize HIV genome insertions

paper/HIV_samples/logs/README

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# logfiles from STDERR and STDOUT of running job files go here

paper/HIV_samples/results/README

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Downloaded FASTQ files and EpitopeID results go here

paper/README

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ paper
1414
|--SyntheticDeletion
1515
|--ENCODEdata-eGFP
1616
|--ENCODEdata-CellLines
17+
|--HIV_samples
1718
|--YKOC-wgs
1819

1920

@@ -47,5 +48,8 @@ contains the scripts and information for downloading ENCODE eGFP data to test Ep
4748
## ENCODEdata-CellLines
4849
contains the scripts and information for downloading ENCODE transcription factor ChIP-seq data to test StrainID
4950

51+
## HIV_samples
52+
contains the scripts and information for downloading, processing, and running EpitopeID on the Bosque et al, 2017 dataset for localizing HIV genome insertions
53+
5054
## YKOC-wgs
5155
contains the scripts and information for downloading, processing, and running DeletionID on the Puddu et al, 2019 dataset for identifying deletions
Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
>AF324493.2 HIV-1 vector pNL4-3, complete sequence
2+
TGGAAGGGCTAATTTGGTCCCAAAAAAGACAAGAGATCCTTGATCTGTGGATCTACCACACACAAGGCTA
3+
CTTCCCTGATTGGCAGAACTACACACCAGGGCCAGGGATCAGATATCCACTGACCTTTGGATGGTGCTTC
4+
AAGTTAGTACCAGTTGAACCAGAGCAAGTAGAAGAGGCCAATGAAGGAGAGAACAACAGCTTGTTACACC
5+
CTATGAGCCAGCATGGGATGGAGGACCCGGAGGGAGAAGTATTAGTGTGGAAGTTTGACAGCCTCCTAGC
6+
ATTTCGTCACATGGCCCGAGAGCTGCATCCGGAGTACTACAAAGACTGCTGACATCGAGCTTTCTACAAG
7+
GGACTTTCCGCTGGGGACTTTCCAGGGAGGTGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGAT
8+
GCTACATATAAGCAGCTGCTTTTTGCCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGC
9+
TCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTCAAAGTAGTGTG
10+
TGCCCGTCTGTTGTGTGACTCTGGTAACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCT
11+
AGCAGTGGCGCCCGAACAGGGACTTGAAAGCGAAAGTAAAGCCAGAGGAGATCTCTCGACGCAGGACTCG
12+
GCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAATTTTGACTAGC
13+
GGAGGCTAGAAGGAGAGAGATGGGTGCGAGAGCGTCGGTATTAAGCGGGGGAGAATTAGATAAATGGGAA
14+
AAAATTCGGTTAAGGCCAGGGGGAAAGAAACAATATAAACTAAAACATATAGTATGGGCAAGCAGGGAGC
15+
TAGAACGATTCGCAGTTAATCCTGGCCTTTTAGAGACATCAGAAGGCTGTAGACAAATACTGGGACAGCT
16+
ACAACCATCCCTTCAGACAGGATCAGAAGAACTTAGATCATTATATAATACAATAGCAGTCCTCTATTGT
17+
GTGCATCAAAGGATAGATGTAAAAGACACCAAGGAAGCCTTAGATAAGATAGAGGAAGAGCAAAACAAAA
18+
GTAAGAAAAAGGCACAGCAAGCAGCAGCTGACACAGGAAACAACAGCCAGGTCAGCCAAAATTACCCTAT
19+
AGTGCAGAACCTCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCATGGGTAAAA
20+
GTAGTAGAAGAGAAGGCTTTCAGCCCAGAAGTAATACCCATGTTTTCAGCATTATCAGAAGGAGCCACCC
21+
CACAAGATTTAAATACCATGCTAAACACAGTGGGGGGACATCAAGCAGCCATGCAAATGTTAAAAGAGAC
22+
CATCAATGAGGAAGCTGCAGAATGGGATAGATTGCATCCAGTGCATGCAGGGCCTATTGCACCAGGCCAG
23+
ATGAGAGAACCAAGGGGAAGTGACATAGCAGGAACTACTAGTACCCTTCAGGAACAAATAGGATGGATGA
24+
CACATAATCCACCTATCCCAGTAGGAGAAATCTATAAAAGATGGATAATCCTGGGATTAAATAAAATAGT
25+
AAGAATGTATAGCCCTACCAGCATTCTGGACATAAGACAAGGACCAAAGGAACCCTTTAGAGACTATGTA
26+
GACCGATTCTATAAAACTCTAAGAGCCGAGCAAGCTTCACAAGAGGTAAAAAATTGGATGACAGAAACCT
27+
TGTTGGTCCAAAATGCGAACCCAGATTGTAAGACTATTTTAAAAGCATTGGGACCAGGAGCGACACTAGA
28+
AGAAATGATGACAGCATGTCAGGGAGTGGGGGGACCCGGCCATAAAGCAAGAGTTTTGGCTGAAGCAATG
29+
AGCCAAGTAACAAATCCAGCTACCATAATGATACAGAAAGGCAATTTTAGGAACCAAAGAAAGACTGTTA
30+
AGTGTTTCAATTGTGGCAAAGAAGGGCACATAGCCAAAAATTGCAGGGCCCCTAGGAAAAAGGGCTGTTG
31+
GAAATGTGGAAAGGAAGGACACCAAATGAAAGATTGTACTGAGAGACAGGCTAATTTTTTAGGGAAGATC
32+
TGGCCTTCCCACAAGGGAAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCAACAGCCCCACCAGAAG
33+
AGAGCTTCAGGTTTGGGGAAGAGACAACAACTCCCTCTCAGAAGCAGGAGCCGATAGACAAGGAACTGTA
34+
TCCTTTAGCTTCCCTCAGATCACTCTTTGGCAGCGACCCCTCGTCACAATAAAGATAGGGGGGCAATTAA
35+
AGGAAGCTCTATTAGATACAGGAGCAGATGATACAGTATTAGAAGAAATGAATTTGCCAGGAAGATGGAA
36+
ACCAAAAATGATAGGGGGAATTGGAGGTTTTATCAAAGTAAGACAGTATGATCAGATACTCATAGAAATC
37+
TGCGGACATAAAGCTATAGGTACAGTATTAGTAGGACCTACACCTGTCAACATAATTGGAAGAAATCTGT
38+
TGACTCAGATTGGCTGCACTTTAAATTTTCCCATTAGTCCTATTGAGACTGTACCAGTAAAATTAAAGCC
39+
AGGAATGGATGGCCCAAAAGTTAAACAATGGCCATTGACAGAAGAAAAAATAAAAGCATTAGTAGAAATT
40+
TGTACAGAAATGGAAAAGGAAGGAAAAATTTCAAAAATTGGGCCTGAAAATCCATACAATACTCCAGTAT
41+
TTGCCATAAAGAAAAAAGACAGTACTAAATGGAGAAAATTAGTAGATTTCAGAGAACTTAATAAGAGAAC
42+
TCAAGATTTCTGGGAAGTTCAATTAGGAATACCACATCCTGCAGGGTTAAAACAGAAAAAATCAGTAACA
43+
GTACTGGATGTGGGCGATGCATATTTTTCAGTTCCCTTAGATAAAGACTTCAGGAAGTATACTGCATTTA
44+
CCATACCTAGTATAAACAATGAGACACCAGGGATTAGATATCAGTACAATGTGCTTCCACAGGGATGGAA
45+
AGGATCACCAGCAATATTCCAGTGTAGCATGACAAAAATCTTAGAGCCTTTTAGAAAACAAAATCCAGAC
46+
ATAGTCATCTATCAATACATGGATGATTTGTATGTAGGATCTGACTTAGAAATAGGGCAGCATAGAACAA
47+
AAATAGAGGAACTGAGACAACATCTGTTGAGGTGGGGATTTACCACACCAGACAAAAAACATCAGAAAGA
48+
ACCTCCATTCCTTTGGATGGGTTATGAACTCCATCCTGATAAATGGACAGTACAGCCTATAGTGCTGCCA
49+
GAAAAGGACAGCTGGACTGTCAATGACATACAGAAATTAGTGGGAAAATTGAATTGGGCAAGTCAGATTT
50+
ATGCAGGGATTAAAGTAAGGCAATTATGTAAACTTCTTAGGGGAACCAAAGCACTAACAGAAGTAGTACC
51+
ACTAACAGAAGAAGCAGAGCTAGAACTGGCAGAAAACAGGGAGATTCTAAAAGAACCGGTACATGGAGTG
52+
TATTATGACCCATCAAAAGACTTAATAGCAGAAATACAGAAGCAGGGGCAAGGCCAATGGACATATCAAA
53+
TTTATCAAGAGCCATTTAAAAATCTGAAAACAGGAAAGTATGCAAGAATGAAGGGTGCCCACACTAATGA
54+
TGTGAAACAATTAACAGAGGCAGTACAAAAAATAGCCACAGAAAGCATAGTAATATGGGGAAAGACTCCT
55+
AAATTTAAATTACCCATACAAAAGGAAACATGGGAAGCATGGTGGACAGAGTATTGGCAAGCCACCTGGA
56+
TTCCTGAGTGGGAGTTTGTCAATACCCCTCCCTTAGTGAAGTTATGGTACCAGTTAGAGAAAGAACCCAT
57+
AATAGGAGCAGAAACTTTCTATGTAGATGGGGCAGCCAATAGGGAAACTAAATTAGGAAAAGCAGGATAT
58+
GTAACTGACAGAGGAAGACAAAAAGTTGTCCCCCTAACGGACACAACAAATCAGAAGACTGAGTTACAAG
59+
CAATTCATCTAGCTTTGCAGGATTCGGGATTAGAAGTAAACATAGTGACAGACTCACAATATGCATTGGG
60+
AATCATTCAAGCACAACCAGATAAGAGTGAATCAGAGTTAGTCAGTCAAATAATAGAGCAGTTAATAAAA
61+
AAGGAAAAAGTCTACCTGGCATGGGTACCAGCACACAAAGGAATTGGAGGAAATGAACAAGTAGATAAAT
62+
TGGTCAGTGCTGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGCCCAAGAAGAACATGAGAA
63+
ATATCACAGTAATTGGAGAGCAATGGCTAGTGATTTTAACCTACCACCTGTAGTAGCAAAAGAAATAGTA
64+
GCCAGCTGTGATAAATGTCAGCTAAAAGGGGAAGCCATGCATGGACAAGTAGACTGTAGCCCAGGAATAT
65+
GGCAGCTAGATTGTACACATTTAGAAGGAAAAGTTATCTTGGTAGCAGTTCATGTAGCCAGTGGATATAT
66+
AGAAGCAGAAGTAATTCCAGCAGAGACAGGGCAAGAAACAGCATACTTCCTCTTAAAATTAGCAGGAAGA
67+
TGGCCAGTAAAAACAGTACATACAGACAATGGCAGCAATTTCACCAGTACTACAGTTAAGGCCGCCTGTT
68+
GGTGGGCGGGGATCAAGCAGGAATTTGGCATTCCCTACAATCCCCAAAGTCAAGGAGTAATAGAATCTAT
69+
GAATAAAGAATTAAAGAAAATTATAGGACAGGTAAGAGATCAGGCTGAACATCTTAAGACAGCAGTACAA
70+
ATGGCAGTATTCATCCACAATTTTAAAAGAAAAGGGGGGATTGGGGGGTACAGTGCAGGGGAAAGAATAG
71+
TAGACATAATAGCAACAGACATACAAACTAAAGAATTACAAAAACAAATTACAAAAATTCAAAATTTTCG
72+
GGTTTATTACAGGGACAGCAGAGATCCAGTTTGGAAAGGACCAGCAAAGCTCCTCTGGAAAGGTGAAGGG
73+
GCAGTAGTAATACAAGATAATAGTGACATAAAAGTAGTGCCAAGAAGAAAAGCAAAGATCATCAGGGATT
74+
ATGGAAAACAGATGGCAGGTGATGATTGTGTGGCAAGTAGACAGGATGAGGATTAACACATGGAAAAGAT
75+
TAGTAAAACACCATATGTATATTTCAAGGAAAGCTAAGGACTGGTTTTATAGACATCACTATGAAAGTAC
76+
TAATCCAAAAATAAGTTCAGAAGTACACATCCCACTAGGGGATGCTAAATTAGTAATAACAACATATTGG
77+
GGTCTGCATACAGGAGAAAGAGACTGGCATTTGGGTCAGGGAGTCTCCATAGAATGGAGGAAAAAGAGAT
78+
ATAGCACACAAGTAGACCCTGACCTAGCAGACCAACTAATTCATCTGCACTATTTTGATTGTTTTTCAGA
79+
ATCTGCTATAAGAAATACCATATTAGGACGTATAGTTAGTCCTAGGTGTGAATATCAAGCAGGACATAAC
80+
AAGGTAGGATCTCTACAGTACTTGGCACTAGCAGCATTAATAAAACCAAAACAGATAAAGCCACCTTTGC
81+
CTAGTGTTAGGAAACTGACAGAGGACAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCA
82+
TACAATGAATGGACACTAGAGCTTTTAGAGGAACTTAAGAGTGAAGCTGTTAGACATTTTCCTAGGATAT
83+
GGCTCCATAACTTAGGACAACATATCTATGAAACTTACGGGGATACTTGGGCAGGAGTGGAAGCCATAAT
84+
AAGAATTCTGCAACAACTGCTGTTTATCCATTTCAGAATTGGGTGTCGACATAGCAGAATAGGCGTTACT
85+
CGACAGAGGAGAGCAAGAAATGGAGCCAGTAGATCCTAGACTAGAGCCCTGGAAGCATCCAGGAAGTCAG
86+
CCTAAAACTGCTTGTACCAATTGCTATTGTAAAAAGTGTTGCTTTCATTGCCAAGTTTGTTTCATGACAA
87+
AAGCCTTAGGCATCTCCTATGGCAGGAAGAAGCGGAGACAGCGACGAAGAGCTCATCAGAACAGTCAGAC
88+
TCATCAAGCTTCTCTATCAAAGCAGTAAGTAGTACATGTAATGCAACCTATAATAGTAGCAATAGTAGCA
89+
TTAGTAGTAGCAATAATAATAGCAATAGTTGTGTGGTCCATAGTAATCATAGAATATAGGAAAATATTAA
90+
GACAAAGAAAAATAGACAGGTTAATTGATAGACTAATAGAAAGAGCAGAAGACAGTGGCAATGAGAGTGA
91+
AGGAGAAGTATCAGCACTTGTGGAGATGGGGGTGGAAATGGGGCACCATGCTCCTTGGGATATTGATGAT
92+
CTGTAGTGCTACAGAAAAATTGTGGGTCACAGTCTATTATGGGGTACCTGTGTGGAAGGAAGCAACCACC
93+
ACTCTATTTTGTGCATCAGATGCTAAAGCATATGATACAGAGGTACATAATGTTTGGGCCACACATGCCT
94+
GTGTACCCACAGACCCCAACCCACAAGAAGTAGTATTGGTAAATGTGACAGAAAATTTTAACATGTGGAA
95+
AAATGACATGGTAGAACAGATGCATGAGGATATAATCAGTTTATGGGATCAAAGCCTAAAGCCATGTGTA
96+
AAATTAACCCCACTCTGTGTTAGTTTAAAGTGCACTGATTTGAAGAATGATACTAATACCAATAGTAGTA
97+
GCGGGAGAATGATAATGGAGAAAGGAGAGATAAAAAACTGCTCTTTCAATATCAGCACAAGCATAAGAGA
98+
TAAGGTGCAGAAAGAATATGCATTCTTTTATAAACTTGATATAGTACCAATAGATAATACCAGCTATAGG
99+
TTGATAAGTTGTAACACCTCAGTCATTACACAGGCCTGTCCAAAGGTATCCTTTGAGCCAATTCCCATAC
100+
ATTATTGTGCCCCGGCTGGTTTTGCGATTCTAAAATGTAATAATAAGACGTTCAATGGAACAGGACCATG
101+
TACAAATGTCAGCACAGTACAATGTACACATGGAATCAGGCCAGTAGTATCAACTCAACTGCTGTTAAAT
102+
GGCAGTCTAGCAGAAGAAGATGTAGTAATTAGATCTGCCAATTTCACAGACAATGCTAAAACCATAATAG
103+
TACAGCTGAACACATCTGTAGAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAAGTATCCGTAT
104+
CCAGAGGGGACCAGGGAGAGCATTTGTTACAATAGGAAAAATAGGAAATATGAGACAAGCACATTGTAAC
105+
ATTAGTAGAGCAAAATGGAATGCCACTTTAAAACAGATAGCTAGCAAATTAAGAGAACAATTTGGAAATA
106+
ATAAAACAATAATCTTTAAGCAATCCTCAGGAGGGGACCCAGAAATTGTAACGCACAGTTTTAATTGTGG
107+
AGGGGAATTTTTCTACTGTAATTCAACACAACTGTTTAATAGTACTTGGTTTAATAGTACTTGGAGTACT
108+
GAAGGGTCAAATAACACTGAAGGAAGTGACACAATCACACTCCCATGCAGAATAAAACAATTTATAAACA
109+
TGTGGCAGGAAGTAGGAAAAGCAATGTATGCCCCTCCCATCAGTGGACAAATTAGATGTTCATCAAATAT
110+
TACTGGGCTGCTATTAACAAGAGATGGTGGTAATAACAACAATGGGTCCGAGATCTTCAGACCTGGAGGA
111+
GGCGATATGAGGGACAATTGGAGAAGTGAATTATATAAATATAAAGTAGTAAAAATTGAACCATTAGGAG
112+
TAGCACCCACCAAGGCAAAGAGAAGAGTGGTGCAGAGAGAAAAAAGAGCAGTGGGAATAGGAGCTTTGTT
113+
CCTTGGGTTCTTGGGAGCAGCAGGAAGCACTATGGGCGCAGCGTCAATGACGCTGACGGTACAGGCCAGA
114+
CAATTATTGTCTGATATAGTGCAGCAGCAGAACAATTTGCTGAGGGCTATTGAGGCGCAACAGCATCTGT
115+
TGCAACTCACAGTCTGGGGCATCAAACAGCTCCAGGCAAGAATCCTGGCTGTGGAAAGATACCTAAAGGA
116+
TCAACAGCTCCTGGGGATTTGGGGTTGCTCTGGAAAACTCATTTGCACCACTGCTGTGCCTTGGAATGCT
117+
AGTTGGAGTAATAAATCTCTGGAACAGATTTGGAATAACATGACCTGGATGGAGTGGGACAGAGAAATTA
118+
ACAATTACACAAGCTTAATACACTCCTTAATTGAAGAATCGCAAAACCAGCAAGAAAAGAATGAACAAGA
119+
ATTATTGGAATTAGATAAATGGGCAAGTTTGTGGAATTGGTTTAACATAACAAATTGGCTGTGGTATATA
120+
AAATTATTCATAATGATAGTAGGAGGCTTGGTAGGTTTAAGAATAGTTTTTGCTGTACTTTCTATAGTGA
121+
ATAGAGTTAGGCAGGGATATTCACCATTATCGTTTCAGACCCACCTCCCAATCCCGAGGGGACCCGACAG
122+
GCCCGAAGGAATAGAAGAAGAAGGTGGAGAGAGAGACAGAGACAGATCCATTCGATTAGTGAACGGATCC
123+
TTAGCACTTATCTGGGACGATCTGCGGAGCCTGTGCCTCTTCAGCTACCACCGCTTGAGAGACTTACTCT
124+
TGATTGTAACGAGGATTGTGGAACTTCTGGGACGCAGGGGGTGGGAAGCCCTCAAATATTGGTGGAATCT
125+
CCTACAGTATTGGAGTCAGGAACTAAAGAATAGTGCTGTTAACTTGCTCAATGCCACAGCCATAGCAGTA
126+
GCTGAGGGGACAGATAGGGTTATAGAAGTATTACAAGCAGCTTATAGAGCTATTCGCCACATACCTAGAA
127+
GAATAAGACAGGGCTTGGAAAGGATTTTGCTATAAGATGGGTGGCAAGTGGTCAAAAAGTAGTGTGATTG
128+
GATGGCCTGCTGTAAGGGAAAGAATGAGACGAGCTGAGCCAGCAGCAGATGGGGTGGGAGCAGTATCTCG
129+
AGACCTAGAAAAACATGGAGCAATCACAAGTAGCAATACAGCAGCTAACAATGCTGCTTGTGCCTGGCTA
130+
GAAGCACAAGAGGAGGAAGAGGTGGGTTTTCCAGTCACACCTCAGGTACCTTTAAGACCAATGACTTACA
131+
AGGCAGCTGTAGATCTTAGCCACTTTTTAAAAGAAAAGGGGGGACTGGAAGGGCTAATTCACTCCCAAAG
132+
AAGACAAGATATCCTTGATCTGTGGATCTACCACACACAAGGCTACTTCCCTGATTGGCAGAACTACACA
133+
CCAGGGCCAGGGGTCAGATATCCACTGACCTTTGGATGGTGCTACAAGCTAGTACCAGTTGAGCCAGATA
134+
AGGTAGAAGAGGCCAATAAAGGAGAGAACACCAGCTTGTTACACCCTGTGAGCCTGCATGGAATGGATGA
135+
CCCTGAGAGAGAAGTGTTAGAGTGGAGGTTTGACAGCCGCCTAGCATTTCATCACGTGGCCCGAGAGCTG
136+
CATCCGGAGTACTTCAAGAACTGCTGACATCGAGCTTGCTACAAGGGACTTTCCGCTGGGGACTTTCCAG
137+
GGAGGCGTGGCCTGGGCGGGACTGGGGAGTGGCGAGCCCTCAGATGCTGCATATAAGCAGCTGCTTTTTG
138+
CCTGTACTGGGTCTCTCTGGTTAGACCAGATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACT
139+
GCTTAAGCCTCAATAAAGCTTGCCTTGAGTGCTTCAAGTAGTGTGTGCCCGTCTGTTGTGTGACTCTGGT
140+
AACTAGAGATCCCTCAGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCACCCAGGAGGTAGAGGTTGCAG
141+
TGAGCCAAGATCGCGCCACTGCATTCCAGCCTGGGCAAGAAAACAAGACTGTCTAAAATAATAATAATAA
142+
GTTAAGGGTATTAAATATATTTATACATGGAGGTCATAAAAATATATATATTTGGGCTGGGCGCAGTGGC
143+
TCACACCTGCGCCCGGCCCTTTGGGAGGCCGAGGCAGGTGGATCACCTGAGTTTGGGAGTTCCAGACCAG
144+
CCTGACCAACATGGAGAAACCCCTTCTCTGTGTATTTTTAGTAGATTTTATTTTATGTGTATTTTATTCA
145+
CAGGTATTTCTGGAAAACTGAAACTGTTTTTCCTCTACTCTGATACCACAAGAATCATCAGCACAGAGGA
146+
AGACTTCTGTGATCAAATGTGGTGGGAGAGGGAGGTTTTCACCAGCACATGAGCAGTCAGTTCTGCCGCA
147+
GACTCGGCGGGTGTCCTTCGGTTCAGTTCCAACACCGCCTGCCTGGAGAGAGGTCAGACCACAGGGTGAG
148+
GGCTCAGTCCCCAAGACATAAACACCCAAGACATAAACACCCAACAGGTCCACCCCGCCTGCTGCCCAGG
149+
CAGAGCCGATTCACCAAGACGGGAATTAGGATAGAGAAAGAGTAAGTCACACAGAGCCGGCTGTGCGGGA
150+
GAACGGAGTTCTATTATGACTCAAATCAGTCTCCCCAAGCATTCGGGGATCAGAGTTTTTAAGGATAACT
151+
TAGTGTGTAGGGGGCCAGTGAGTTGGAGATGAAAGCGTAGGGAGTCGAAGGTGTCCTTTTGCGCCGAGTC
152+
AGTTCCTGGGTGGGGGCCACAAGATCGGATGAGCCAGTTTATCAATCCGGGGGTGCCAGCTGATCCATGG
153+
AGTGCAGGGTCTGCAAAATATCTCAAGCACTGATTGATCTTAGGTTTTACAATAGTGATGTTACCCCAGG
154+
AACAATTTGGGGAAGGTCAGAATCTTGTAGCCTGTAGCTGCATGACTCCTAAACCATAATTTCTTTTTTG
155+
TTTTTTTTTTTTTATTTTTGAGACAGGGTCTCACTCTGTCACCTAGGCTGGAGTGCAGTGGTGCAATCAC
156+
AGCTCACTGCAGCCTCAACGTCGTAAGCTCAAGCGATCCTCCCACCTCAGCCTGCCTGGTAGCTGAGACT
157+
ACAAGCGACGCCCCAGTTAATTTTTGTATTTTTGGTAGAGGCAGCGTTTTGCCGTGTGGCCCTGGCTGGT
158+
CTCGAACTCCTGGGCTCAAGTGATCCAGCCTCAGCCTCCCAAAGTGCTGGGACAACCGGGGCCAGTCACT
159+
GCACCTGGCCCTAAACCATAATTTCTAATCTTTTGGCTAATTTGTTAGTCCTACAAAGGCAGTCTAGTCC
160+
CCAGGCAAAAAGGGGGTTTGTTTCGGGAAAGGGCTGTTACTGTCTTTGTTTCAAACTATAAACTAAGTTC
161+
CTCCTAAACTTAGTTCGGCCTACACCCAGGAATGAACAAGGAGAGCTTGGAGGTTAGAAGCACGATGGAA
162+
TTGGTTAGGTCAGATCTCTTTCACTGTCTGAGTTATAATTTTGCAATGGTGGTTCAAAGACTGCCCGCTT
163+
CTGACACCAGTCGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCTCT
164+
TCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAA
165+
AGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCA
166+
AAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCAT
167+
CACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCC
168+
CTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCC
169+
TTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCC
170+
AAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTG
171+
AGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAG
172+
GTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGAACAGTATTT
173+
GGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAA
174+
CCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGA
175+
AGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTC
176+
ATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAA
177+
GTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTG
178+
TCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCA
179+
TCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACC
180+
AGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTG
181+
TTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGC
182+
ATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTA
183+
CATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTT
184+
GGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGA
185+
TGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCT
186+
CTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAA
187+
ACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGT
188+
GCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAA
189+
ATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTA
190+
TTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAA
191+
ATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACAT
192+
TAACCTATAAAAATAGGCGTATCACGAGGCCCTTTCGTCTCGCGCGTTTCGGTGATGACGGTGAAAACCT
193+
CTGACACATGCAGCTCCCGGAGACGGTCACAGCTTGTCTGTAAGCGGATGCCGGGAGCAGACAAGCCCGT
194+
CAGGGCGCGTCAGCGGGTGTTGGCGGGTGTCGGGGCTGGCTTAACTATGCGGCATCAGAGCAGATTGTAC
195+
TGAGAGTGCACCATATGCGGTGTGAAATACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGCGCCA
196+
TTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGGGG
197+
AGGCAGAGATTGCAGTAAGCTGAGATCGCAGCACTGCACTCCAGCCTGGGCGACAGAGTAAGACTCTGTC
198+
TCAAAAATAAAATAAATAAATCAATCAGATATTCCAATCTTTTCCTTTATTTATTTATTTATTTTCTATT
199+
TTGGAAACACAGTCCTTCCTTATTCCAGAATTACACATATATTCTATTTTTCTTTATATGCTCCAGTTTT
200+
TTTTAGACCTTCACCTGAAATGTGTGTATACAAAATCTAGGCCAGTCCAGCAGAGCCTAAAGGTAAAAAA
201+
TAAAATAATAAAAAATAAATAAAATCTAGCTCACTCCTTCACATCAAAATGGAGATACAGCTGTTAGCAT
202+
TAAATACCAAATAACCCATCTTGTCCTCAATAATTTTAAGCGCCTCTCTCCACCACATCTAACTCCTGTC
203+
AAAGGCATGTGCCCCTTCCGGGCGCTCTGCTGTGCTGCCAACCAACTGGCATGTGGACTCTGCAGGGTCC
204+
CTAACTGCCAAGCCCCACAGTGTGCCCTGAGGCTGCCCCTTCCTTCTAGCGGCTGCCCCCACTCGGCTTT
205+
GCTTTCCCTAGTTTCAGTTACTTGCGTTCAGCCAAGGTCTGAAACTAGGTGCGCACAGAGCGGTAAGACT
206+
GCGAGAGAAAGAGACCAGCTTTACAGGGGGTTTATCACAGTGCACCCTGACAGTCGTCAGCCTCACAGGG
207+
GGTTTATCACATTGCACCCTGACAGTCGTCAGCCTCACAGGGGGTTTATCACAGTGCACCCTTACAATCA
208+
TTCCATTTGATTCACAATTTTTTTAGTCTCTACTGTGCCTAACTTGTAAGTTAAATTTGATCAGAGGTGT
209+
GTTCCCAGAGGGGAAAACAGTATATACAGGGTTCAGTACTATCGCATTTCAGGCCTCCACCTGGGTCTTG
210+
GAATGTGTCCCCCGAGGGGTGATGACTACCTCAGTTGGATCTCCACAGGTCACAGTGACACAAGATAACC
211+
AAGACACCTCCCAAGGCTACCACAATGGGCCGCCCTCCACGTGCACATGGCCGGAGGAACTGCCATGTCG
212+
GAGGTGCAAGCACACCTGCGCATCAGAGTCCTTGGTGTGGAGGGAGGGACCAGCGCAGCTTCCAGCCATC
213+
CACCTGATGAACAGAACCTAGGGAAAGCCCCAGTTCTACTTACACCAGGAAAGGC
214+

paper/setup.sh

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,17 @@
1515
# wget
1616
# Perl 5.18+
1717
# bwa v0.7.14+
18-
#
18+
#
1919
# Optional software:
2020
# twoBitToFa
2121

22-
WRK=/path/to/GenoPipe/paper/YKOC-wgs
22+
WRK=/path/to/GenoPipe/paper
2323
cd $WRK
2424

2525
module load gcc
2626
module load bwa
2727
module load samtools
2828

29-
3029
[ -d $WRK/input ] || mkdir $WRK/input
3130
[ -d $WRK/db ] || mkdir $WRK/db
3231

@@ -120,16 +119,33 @@ mv ALL_TAG.fa* ../
120119
echo "Complete"
121120
cd $WRK
122121

122+
# Build Human EpiID with HIV genome
123+
HHIVDB=db/hiv_EpiID
124+
[ -d $HHIVDB ] || cp -r ../EpitopeID/hg19_EpiID/ $HHIVDB
125+
if [ ! -f $HHIVDB/FASTA_genome/genome.fa ]; then
126+
echo "**Setup Human EpiID genome..."
127+
cp input/hg19.fa $HHIVDB/FASTA_genome/genome.fa
128+
echo "BWA Indexing genome..."
129+
bwa index $HHIVDB/FASTA_genome/genome.fa
130+
echo "Complete"
131+
fi
132+
echo "Setup HIV genome for Human EpiID..."
133+
cp input/AF324493.2_HIV-1_vector_pNL4-3.fa $HHIVDB/FASTA_tag/Tag_DB/
134+
rm $HHIVDB/FASTA_tag/ALL_TAG.fa*
135+
cd $HHIVDB/FASTA_tag/Tag_DB
136+
cat *.fa *.fna *.ffn *.fasta > ALL_TAG.fa
137+
bwa index ALL_TAG.fa
138+
mv ALL_TAG.fa* ../
139+
echo "Complete"
140+
cd $WRK
123141

124142
# Add Yeast Deletion DB to paper/db
125143
cd $WRK/db
126144
ln -s ../../DeletionID/sacCer3_Del
127145
cd $WRK
128146

129-
130147
# Add Yeast & Human StrainID DB to paper/db
131148
cd $WRK/db
132149
ln -s ../../StrainID/sacCer3_VCF
133150
ln -s ../../StrainID/hg19_VCF
134151
cd $WRK
135-

0 commit comments

Comments
 (0)