NGSOmics_Programming/ATACSeq/PostAlignment.Rmd at main · SciComp8/NGSOmics_Programming · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
---
title: "ATAC-Seq Data Analysis with Human Data - Perform post-alignment processing"
author: "Anni Liu"
date: '`r format(Sys.Date(), "%B %d, %Y")`'
output:
  html_document:
    code_folding: show
---

```{r, shorcut, include=FALSE}
## RStudio keyboard shortcut
# Cursor at the beginning of a command line: Ctrl+A
# Cursor at the end of a command line: Ctrl+E
# Clear all the code from your console: Ctrl+L
# Create a pipe operator %>%: Ctrl+Shift+M (Windows) or Cmd+Shift+M (Mac)
# Create an assignment operator <-: Alt+- (Windows) or Option+- (Mac)
# Knit a document (knitr): Ctrl+Shift+K (Windows) or Cmd+Shift+K (Mac)
# Comment or uncomment current selection: Ctrl+Shift+C (Windows) or Cmd+Shift+C (Mac)
```


# [QC] Perform post-alignment processing
## Select properly paired reads using `ScanBamParam()` and `scanBamFlag()`
```{r}
library(GenomicAlignments)
flags <- scanBamFlag(isProperPair = T)
# Here we only select reads paired in alignment within the preset max fragment length 600 bp
# isProperPair = T: represent reads aligning to identical reference sequences and with a specified distance, 600 bp in our case.

# Recap maxFragLength = 600 # Line 501: Perform the alignment relative to hg38 using the new ATACseq data from female lung
# align(index = "BSgenome.Hsapiens.UCSC.hg38.mainchr",
#       readfile1 = "./data/SRR6870408_1.fastq.gz",
#       readfile2 = "./data/SRR6870408_2.fastq.gz",
#       output_file = "./data/ATAC_female_lung.bam",
#       nthreads = 4, type = 1,
#       unique = TRUE, maxFragLength = 600)

# Set up parameters for scanning BAM files
chr20_param  <- ScanBamParam(flag = flags,
                             what = c("qname", "mapq", "isize"),
                             which = GRanges("chr20", IRanges(1, 64444167)))

# https://genome.ucsc.edu/cgi-bin/hgTracks?db=hg19&chromInfoPage=
# 63025520: length of chromosome 20 (GRCh37/hg19)
# https://genome.ucsc.edu/cgi-bin/hgTracks?hgsid=1795230954_XcEyjdAa9Z968dnsdwUvHaqjwvOg&chromInfoPage=
# 64444167: length of chromosome 20 (GRCh38/hg38)
# qname: reads name
# mapq: quality of alignment
# isize: insert size of our fragment
chr20_param

# class: ScanBamParam
# bamFlag (NA unless specified): isProperPair=TRUE
# bamSimpleCigar: FALSE
# bamReverseComplement: FALSE
# bamTag:
# bamTagFilter:
# bamWhich: 1 ranges
# bamWhat: qname, mapq, isize
# bamMapqFilter: NA
```

```{r two.chromosomes}
chr1X_param <- ScanBamParam(flag = flags,
                            what = c("qname", "mapq", "isize"),
                            which = IRangesList(chr1=IRanges(1, 248956422),
                                                chrX=IRanges(1, 156040895)))
```

```{r all.chromosomes}
# Approach 1:
# https://genome.ucsc.edu/cgi-bin/hgTracks?hgsid=1795230954_XcEyjdAa9Z968dnsdwUvHaqjwvOg&chromInfoPage=
scanBamHeader(sorted_bam)[[1]]$targets
mainchr_param <- ScanBamParam(flag = flags,
                              what = c("qname", "mapq", "isize"),
                              which = IRangesList(chr1=IRanges(1, 248956422),
                                                  chr2=IRanges(1, 242193529),
                                                  chr3=IRanges(1, 198295559),
                                                  chr4=IRanges(1, 190214555),
                                                  chr5=IRanges(1, 181538259),
                                                  chr6=IRanges(1, 170805979),
                                                  chr7=IRanges(1, 159345973),
                                                  chr8=IRanges(1, 145138636),
                                                  chr9=IRanges(1, 138394717),
                                                  chr10=IRanges(1, 133797422),
                                                  chr11=IRanges(1, 135086622),
                                                  chr12=IRanges(1, 133275309),
                                                  chr13=IRanges(1, 114364328),
                                                  chr14=IRanges(1, 107043718),
                                                  chr15=IRanges(1, 101991189),
                                                  chr16=IRanges(1, 90338345),
                                                  chr17=IRanges(1, 83257441),
                                                  chr18=IRanges(1, 80373285),
                                                  chr19=IRanges(1, 58617616),
                                                  chr20=IRanges(1, 64444167),
                                                  chr21=IRanges(1, 46709983),
                                                  chr22=IRanges(1, 50818468),
                                                  chrX=IRanges(1, 156040895),
                                                  chrY=IRanges(1, 57227415)))


# Approach 2:
as(Seqinfo(genome = "hg38"), "GRanges")@seqnames@values
mainchr_param_2 <- ScanBamParam(flag = flags,
                                what = c("qname", "mapq", "isize"),
                                which = as(Seqinfo(genome="hg38"), "GRanges")[1:24])
mainchr_param_2@which@partitioning@NAMES <- paste0("chr", c(1:22, "X", "Y"))
mainchr_param_2@which@partitioning@end <- 1:24
```


## Read properly paired reads
```{r}
# Global view of the 2 reads, containing the seqnames, strand, ranges of 2 reads
atac_reads_chr20 <- readGAlignmentPairs(sorted_bam, param = chr20_param)
class(atac_reads_chr20)
# [1] "GAlignmentPairs"
# attr(,"package")
# [1] "GenomicAlignments"
atac_reads_chr20
# GAlignmentPairs object with 686908 pairs, strandMode=1, and 0 metadata columns:
#            seqnames strand   :            ranges  --            ranges
#               <Rle>  <Rle>   :         <IRanges>  --         <IRanges>
#        [1]    chr20      -   :       60100-60196  --       60007-60106
#        [2]    chr20      +   :       60246-60280  --       60248-60288
#        [3]    chr20      +   :       60255-60296  --       60257-60298
#        [4]    chr20      +   :       60260-60296  --       60262-60298
#        [5]    chr20      +   :       60261-60315  --       60263-60323
#        ...      ...    ... ...               ... ...               ...
#   [686904]    chr20      -   : 64332264-64332359  -- 64332180-64332268
#   [686905]    chr20      -   : 64332477-64332521  -- 64332469-64332513
#   [686906]    chr20      +   : 64332967-64333059  -- 64332969-64333061
#   [686907]    chr20      +   : 64333282-64333381  -- 64333312-64333411
#   [686908]    chr20      +   : 64334000-64334042  -- 64334002-64334044
#   -------
#   seqinfo: 25 sequences from an unspecified genome
atac_reads_chr20[1:2, ]

# GAlignmentPairs object with 4348926 pairs, strandMode=1, and 0 metadata columns:
#             seqnames strand   :              ranges  --              ranges
#                <Rle>  <Rle>   :           <IRanges>  --           <IRanges>
#         [1]     chr1      +   :          9996-10084  --         10277-10374
#         [2]     chr1      +   :          9997-10087  --         10082-10179
#         [3]     chr1      +   :         10003-10091  --         10123-10221
#         [4]     chr1      +   :         10004-10099  --         10009-10108
#         [5]     chr1      +   :         10004-10104  --         10037-10136
#         ...      ...    ... ...                 ... ...                 ...
#   [4348922]     chrX      -   : 156030710-156030739  -- 156030677-156030713
#   [4348923]     chrX      +   : 156030720-156030793  -- 156030723-156030795
#   [4348924]     chrX      +   : 156030726-156030759  -- 156030728-156030761
#   [4348925]     chrX      -   : 156030736-156030787  -- 156030725-156030785
#   [4348926]     chrX      -   : 156030748-156030783  -- 156030289-156030331
#   -------
#   seqinfo: 25 sequences from an unspecified genome

atac_reads_chr1X <- readGAlignmentPairs(sorted_bam, param = chr1X_param)
atac_reads_chr1X
# GAlignmentPairs object with 4348926 pairs, strandMode=1, and 0 metadata columns:
#             seqnames strand   :              ranges [read1]  --      ranges [read2]
#                <Rle>  <Rle>   :           <IRanges>  --           <IRanges>
#         [1]     chr1      +   :          9996-10084  --         10277-10374
#         [2]     chr1      +   :          9997-10087  --         10082-10179
#         [3]     chr1      +   :         10003-10091  --         10123-10221
#         [4]     chr1      +   :         10004-10099  --         10009-10108
#         [5]     chr1      +   :         10004-10104  --         10037-10136
#         ...      ...    ... ...                 ... ...                 ...
#   [4348922]     chrX      -   : 156030710-156030739  -- 156030677-156030713
#   [4348923]     chrX      +   : 156030720-156030793  -- 156030723-156030795
#   [4348924]     chrX      +   : 156030726-156030759  -- 156030728-156030761
#   [4348925]     chrX      -   : 156030736-156030787  -- 156030725-156030785
#   [4348926]     chrX      -   : 156030748-156030783  -- 156030289-156030331
#   -------
#   seqinfo: 25 sequences from an unspecified genome

# Gain information on the first or second/last alignment for each alignment pair in atac_reads_chr1X, containing the seqnames, strand, cigar, qwidth, start, end, width, qname, mapq, isize
read1 <- first(atac_reads_chr1X)
read2 <- second(atac_reads_chr1X)
read1[1, ] # This shows the first alignment in the first read
# GAlignments object with 1 alignment and 3 metadata columns:
#       seqnames strand       cigar    qwidth     start       end     width     njunc |              qname
#          <Rle>  <Rle> <character> <integer> <integer> <integer> <integer> <integer> |        <character>
#   [1]     chr1      +    30M2I59M        91      9996     10084        89         0 | SRR6870408.2987101
#            mapq     isize
#       <integer> <integer>
#   [1]         2       379
#   -------
#   seqinfo: 25 sequences from an unspecified genome
read2[1, ] # This shows the first alignment in the paired second read
# GAlignments object with 1 alignment and 3 metadata columns:
#       seqnames strand         cigar    qwidth     start       end     width     njunc |              qname
#          <Rle>  <Rle>   <character> <integer> <integer> <integer> <integer> <integer> |        <character>
#   [1]     chr1      - 15M1I62M1I21M       100     10277     10374        98         0 | SRR6870408.2987101
#            mapq     isize
#       <integer> <integer>
#   [1]         2      -379
#   -------
#   seqinfo: 25 sequences from an unspecified genome
```

```{r}
atac_reads_mainchr <- readGAlignmentPairs(sorted_bam, param = mainchr_param)
```


## Retrieve and summarize MapQ scores
```{r}
# Do our reads have the high-quality mapping?
read1_mapq <- mcols(read1)$mapq # mapq is a metadata column which can be accessed using mcols()
read2_mapq <- mcols(read2)$mapq
read1_mapq
read2_mapq

# *How our MapQ scores distribute across each read?
read1_mapq_dist <- table(read1_mapq); read2_mapq_dist <- table(read2_mapq)
read1_mapq_dist
read2_mapq_dist

# Visualize the distribution of MapQ scores per read
library(ggplot2)
plot_df <- data.frame(MapQ = c(names(read1_mapq_dist), names(read2_mapq_dist)),
                      Frequency = c(read1_mapq_dist, read2_mapq_dist),
                      Read = c(rep("Read1", length(read1_mapq_dist)),
                              rep("Read2", length(read2_mapq_dist))))
plot_df$MapQ <- factor(plot_df$MapQ,
                       levels = unique(sort(as.numeric(plot_df$MapQ))))
plot_df |>
  ggplot(mapping = aes(x = MapQ, y = Frequency, fill = MapQ)) +
    geom_bar(stat = "identity") +
    facet_grid(~Read)
```


## Retrieve and summarize insert sizes
```{r}
# Distance between read1 and read2 = distance between read2 and read1, so we only need to view the insert size of read1
insert_size <- mcols(read1)$isize |> abs()
# insert_size_2 <- elementMetadata(read1)$isize |> abs()
insert_size
length(insert_size)
# [1] 4348926

# *How our insert sizes distribute?
(insert_size_dist <- table(insert_size))

# Visualize the distribution
plot_df <- data.frame(Insert_size = as.numeric(names(insert_size_dist)),
                      Count = as.numeric(insert_size_dist)) # as.numeric only extracts the frequency values in a frequency table
(insert_size_plot <- plot_df |>
  ggplot(mapping = aes(x = Insert_size, y = Count)) +
  geom_line())

# Use the log2 transformation to the counts on y-axis to show the nucleosome patterning
(insert_size_plot +
    scale_y_continuous(trans = "log2"))

# The log2 transformed count of <100 bp is too high -> the concentration of transposases is too high
# Recap: ATAC-seq (Assay for Transposase-Accessible Chromatin with high-throughput sequencing) is a method for determining chromatin accessibility across the genome. It utilizes a hyperactive Tn5 transposase to insert sequencing adapters into open chromatin regions. High-throughput sequencing then yields reads that indicate these regions of increased accessibility.

# Nucleosome free (<100 bp), mononucleosome (180-247 bp), dinucleosome (315-437 bp)
(insert_size_plot + scale_y_continuous(trans = "log2") +
  geom_vline(xintercept = c(180, 247),colour = "red") +
  geom_vline(xintercept = c(315, 437),colour="darkblue") +
  geom_vline(xintercept = c(100), colour = "darkgreen"))
```


## Subset ATACseq reads by varying insert sizes
```{r}
# Nucleosome free (<100 bp), mononucleosome (180-247 bp), dinucleosome (315-437 bp)
# Why 100, not 147? Make sure the open region does not overlap the nucleosome location
atac_reads_chr1X_free <- atac_reads_chr1X[insert_size < 100, ]
atac_reads_chr1X_mono <- atac_reads_chr1X[insert_size > 180 & insert_size < 240, ]
atac_reads_chr1X_di <- atac_reads_chr1X[insert_size > 315 & insert_size < 437, ]
```

```{r}
read1 <- first(atac_reads_mainchr)
insert_size <- mcols(read1)$isize |> abs()
atac_reads_mainchr_free <- atac_reads_mainchr[insert_size < 100, ]
```


## Save the subsetted ATACseq reads as BMA files
```{r}
sorted_bam
# [1] "./Sorted_ATAC_female_lung_bowtie2.bam"
free_bam <- gsub("\\.bam", "_chr1X_free\\.bam", sorted_bam) # "./Sorted_ATAC_female_lung_bowtie2_chr1X_free.bam"
mono_bam <- gsub("\\.bam", "_chr1X_mono\\.bam", sorted_bam)
di_bam <- gsub("\\.bam", "_chr1X_di\\.bam", sorted_bam)
full_bam <- gsub("\\.bam", "_chr1X\\.bam", sorted_bam)

library(rtracklayer)
export(object = atac_reads_chr1X_free, con = free_bam, format = "bam") # Save the object atac_reads_chr1X_free with the filename in `free_bam`
export(object = atac_reads_chr1X_mono, con = mono_bam, format = "bam")
export(object = atac_reads_chr1X_di, con = di_bam, format = "bam")
export(object = atac_reads_chr1X, con = full_bam, format = "bam")
```

```{r}
free_bam <- gsub("\\.bam", "_mainchr_free\\.bam", sorted_bam)
export(object = atac_reads_mainchr_free, con = free_bam, format = "bam")
```


## Reconstruct one full-length fragment from 2 paired reads using `granges`
```{r}
# Show the GAlignmentPairs object with 1 pair
atac_reads_chr1X[1, ]
# GAlignmentPairs object with 1 pair, strandMode=1, and 0 metadata columns:
#       seqnames strand :     ranges --      ranges
#          <Rle>  <Rle> :  <IRanges> --   <IRanges>
#   [1]     chr1      + : 9996-10084 -- 10277-10374

# Show the GRange object with 1 range, which collapses read 1 and read2 into one long read
atac_reads_chr1X_long <- granges(atac_reads_chr1X)
atac_reads_chr1X_long[1, ]
# GRanges object with 1 range and 0 metadata columns:
#       seqnames     ranges strand
#          <Rle>  <IRanges>  <Rle>
#   [1]     chr1 9996-10374      +
#   -------
#   seqinfo: 25 sequences from an unspecified genome
```


## [QC] Estimate the non-redundant fraction (NRF) of the full-length fragments
```{r}
duplicated(atac_reads_chr1X_long) |> length() # duplicated() returns a logical vector indicating which elements (rows) are duplicates.
# [1] 4348926
duplicated_frag <- sum(duplicated(atac_reads_chr1X_long))
duplicate_rate <- duplicated_frag/length(atac_reads_chr1X_long)
(non_redundant_fraction <- 1 - duplicate_rate) # [1] 0.7760803
```


## Construct an open region bigWig
```{r}
class(atac_reads_chr1X_long)
# [1] "GRanges"
# attr(,"package")
# [1] "GenomicRanges"
length(atac_reads_chr1X_long)
# [1] 4348926
open_region_bw <- gsub("\\.bam","_open_region\\.bw", sorted_bam)
coverage_bw <- coverage(atac_reads_chr1X_long,
                        weight = (10^6/length(atac_reads_chr1X_long))) # Normalize to total mapped reads
export.bw(coverage_bw, open_region_bw)
```