201-跟着官方文档来看看GRanges这个对象
刘小泽写于2020.8.5 今天跟着官方文档来看看GRanges这个对象
1 前言
内容来自:https://bioconductor.org/packages/release/bioc/vignettes/GenomicRanges/inst/doc/GenomicRangesIntroduction.html
GenomicRanges是Bioconductor各个项目都在使用的基因组坐标的存储方式,它基于IRanges 建立,目前为BSgenome、Rsamtools、ShortRead 、rtracklayer、GenomicFeatures、 GenomicAlignments、VariantAnnotation 等提供支持
安装加载
if (!require("BiocManager"))install.packages("BiocManager")
if (!require("GenomicRanges"))BiocManager::install("GenomicRanges")
library(GenomicRanges)
2 GRanges: Genomic Ranges
存储了一系列的基因组起始、终止坐标,可以为连续的结合位点、转录本、外显子等提供支持
gr <- GRanges(
seqnames = Rle(c("chr1", "chr2", "chr1", "chr3"), c(1, 3, 2, 4)),
ranges = IRanges(101:110, end = 111:120, names = head(letters, 10)),
strand = Rle(strand(c("-", "+", "*", "+", "-")), c(1, 2, 2, 3, 2)),
score = 1:10,
GC = seq(1, 0, length=10))
gr
#> GRanges object with 10 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> a chr1 101-111 - | 1 1
#> b chr2 102-112 + | 2 0.888888888888889
#> c chr2 103-113 + | 3 0.777777777777778
#> d chr2 104-114 * | 4 0.666666666666667
#> e chr1 105-115 * | 5 0.555555555555556
#> f chr1 106-116 + | 6 0.444444444444444
#> g chr3 107-117 + | 7 0.333333333333333
#> h chr3 108-118 + | 8 0.222222222222222
#> i chr3 109-119 - | 9 0.111111111111111
#> j chr3 110-120 - | 10 0
#> -------
#> seqinfo: 3 sequences from an unspecified genome; no seqlengths
左边是基因组坐标(染色体名称、范围、正负链),右边是meta信息(score、GC等等)
2.1 认识这个数据
其中包含多少行
names(gr)
#> [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"
# 或者
length(gr)
#> [1] 10
分别提取seqnames、ranges、strand
seqnames(gr)
#> factor-Rle of length 10 with 4 runs
#> Lengths: 1 3 2 4
#> Values : chr1 chr2 chr1 chr3
#> Levels(3): chr1 chr2 chr3
ranges(gr)
#> IRanges object with 10 ranges and 0 metadata columns:
#> start end width
#> <integer> <integer> <integer>
#> a 101 111 11
#> b 102 112 11
#> c 103 113 11
#> d 104 114 11
#> e 105 115 11
#> f 106 116 11
#> g 107 117 11
#> h 108 118 11
#> i 109 119 11
#> j 110 120 11
strand(gr)
#> factor-Rle of length 10 with 5 runs
#> Lengths: 1 2 2 3 2
#> Values : - + * + -
#> Levels(3): + - *
一次性提取seqnames、ranges、strand
granges(gr)
#> GRanges object with 10 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> a chr1 101-111 -
#> b chr2 102-112 +
#> c chr2 103-113 +
#> d chr2 104-114 *
#> e chr1 105-115 *
#> f chr1 106-116 +
#> g chr3 107-117 +
#> h chr3 108-118 +
#> i chr3 109-119 -
#> j chr3 110-120 -
#> -------
#> seqinfo: 3 sequences from an unspecified genome; no seqlengths
一次性提取meta信息
得到一个数据框
mcols(gr)
#> DataFrame with 10 rows and 2 columns
#> score GC
#> <integer> <numeric>
#> a 1 1
#> b 2 0.888888888888889
#> c 3 0.777777777777778
#> d 4 0.666666666666667
#> e 5 0.555555555555556
#> f 6 0.444444444444444
#> g 7 0.333333333333333
#> h 8 0.222222222222222
#> i 9 0.111111111111111
#> j 10 0
如果只想提取其中的某部分
mcols(gr)$score
#> [1] 1 2 3 4 5 6 7 8 9 10
还可以加入染色体长度
seqlengths(gr) <- c(249250621, 243199373, 198022430)
# 提取
seqlengths(gr)
#> chr1 chr2 chr3
#> 249250621 243199373 198022430
2.2 拆分+取子集
拆分
利用split
会得到一个GRangesList对象
sp <- split(gr, rep(1:2, each=5))
sp
#> GRangesList object of length 2:
#> $`1`
#> GRanges object with 5 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> a chr1 101-111 - | 1 1
#> b chr2 102-112 + | 2 0.888888888888889
#> c chr2 103-113 + | 3 0.777777777777778
#> d chr2 104-114 * | 4 0.666666666666667
#> e chr1 105-115 * | 5 0.555555555555556
#> -------
#> seqinfo: 3 sequences from an unspecified genome
#>
#> $`2`
#> GRanges object with 5 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> f chr1 106-116 + | 6 0.444444444444444
#> g chr3 107-117 + | 7 0.333333333333333
#> h chr3 108-118 + | 8 0.222222222222222
#> i chr3 109-119 - | 9 0.111111111111111
#> j chr3 110-120 - | 10 0
#> -------
#> seqinfo: 3 sequences from an unspecified genome
如果要再连起来
c(sp[[1]], sp[[2]])
#> GRanges object with 10 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> a chr1 101-111 - | 1 1
#> b chr2 102-112 + | 2 0.888888888888889
#> c chr2 103-113 + | 3 0.777777777777778
#> d chr2 104-114 * | 4 0.666666666666667
#> e chr1 105-115 * | 5 0.555555555555556
#> f chr1 106-116 + | 6 0.444444444444444
#> g chr3 107-117 + | 7 0.333333333333333
#> h chr3 108-118 + | 8 0.222222222222222
#> i chr3 109-119 - | 9 0.111111111111111
#> j chr3 110-120 - | 10 0
#> -------
#> seqinfo: 3 sequences from an unspecified genome
简单取子集
操作和向量的操作类似
gr[2:3]
#> GRanges object with 2 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> b chr2 102-112 + | 2 0.888888888888889
#> c chr2 103-113 + | 3 0.777777777777778
#> -------
#> seqinfo: 3 sequences from an unspecified genome
如果要再选取一些meta信息,操作又和数据框类似
gr[2:3, "GC"]
#> GRanges object with 2 ranges and 1 metadata column:
#> seqnames ranges strand | GC
#> <Rle> <IRanges> <Rle> | <numeric>
#> b chr2 102-112 + | 0.888888888888889
#> c chr2 103-113 + | 0.777777777777778
#> -------
#> seqinfo: 3 sequences from an unspecified genome
对子集重新赋值
# 先把全部的行拆成一个列表,然后把第一行赋值给第二行
singles <- split(gr, names(gr))
grMod <- gr
grMod[2] <- singles[[1]]
head(grMod, n=3)
#> GRanges object with 3 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> a chr1 101-111 - | 1 1
#> b chr1 101-111 - | 1 1
#> c chr2 103-113 + | 3 0.777777777777778
#> -------
#> seqinfo: 3 sequences from an unspecified genome
还有重复、翻转、取特定区域等操作
# 重复
rep(singles[[2]], times = 3)
#> GRanges object with 3 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> b chr2 102-112 + | 2 0.888888888888889
#> b chr2 102-112 + | 2 0.888888888888889
#> b chr2 102-112 + | 2 0.888888888888889
#> -------
#> seqinfo: 3 sequences from an unspecified genome
# 翻转
rev(gr)
#> GRanges object with 10 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> j chr3 110-120 - | 10 0
#> i chr3 109-119 - | 9 0.111111111111111
#> h chr3 108-118 + | 8 0.222222222222222
#> g chr3 107-117 + | 7 0.333333333333333
#> f chr1 106-116 + | 6 0.444444444444444
#> e chr1 105-115 * | 5 0.555555555555556
#> d chr2 104-114 * | 4 0.666666666666667
#> c chr2 103-113 + | 3 0.777777777777778
#> b chr2 102-112 + | 2 0.888888888888889
#> a chr1 101-111 - | 1 1
#> -------
#> seqinfo: 3 sequences from an unspecified genome
# 前后两行
head(gr,n=2)
#> GRanges object with 2 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> a chr1 101-111 - | 1 1
#> b chr2 102-112 + | 2 0.888888888888889
#> -------
#> seqinfo: 3 sequences from an unspecified genome
tail(gr,n=2)
#> GRanges object with 2 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> i chr3 109-119 - | 9 0.111111111111111
#> j chr3 110-120 - | 10 0
#> -------
#> seqinfo: 3 sequences from an unspecified genome
# 取第2-4行
window(gr, start=2,end=4)
#> GRanges object with 3 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> b chr2 102-112 + | 2 0.888888888888889
#> c chr2 103-113 + | 3 0.777777777777778
#> d chr2 104-114 * | 4 0.666666666666667
#> -------
#> seqinfo: 3 sequences from an unspecified genome
# 取2-3行 + 7-9行
gr[IRanges(start=c(2,7), end=c(3,9))]
#> GRanges object with 5 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> b chr2 102-112 + | 2 0.888888888888889
#> c chr2 103-113 + | 3 0.777777777777778
#> g chr3 107-117 + | 7 0.333333333333333
#> h chr3 108-118 + | 8 0.222222222222222
#> i chr3 109-119 - | 9 0.111111111111111
#> -------
#> seqinfo: 3 sequences from an unspecified genome
3 对区间操作
3.1 intra-range(同一个GRange中;行内部调整)
获取数据
g <- gr[1:3]
g <- append(g, singles[[10]])
start(g)
#> [1] 101 102 103 110
end(g)
#> [1] 111 112 113 120
width(g)
#> [1] 11 11 11 11
# 如果存在两行包含的关系,range就会合并显示
range(g)
#> GRanges object with 3 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> [1] chr1 101-111 -
#> [2] chr2 102-113 +
#> [3] chr3 110-120 -
#> -------
#> seqinfo: 3 sequences from an unspecified genome
g
#> GRanges object with 4 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> a chr1 101-111 - | 1 1
#> b chr2 102-112 + | 2 0.888888888888889
#> c chr2 103-113 + | 3 0.777777777777778
#> j chr3 110-120 - | 10 0
#> -------
#> seqinfo: 3 sequences from an unspecified genome
对侧翼操作
# 取上游10bp(区分正负链)
flank(g, 10)
#> GRanges object with 4 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> a chr1 112-121 - | 1 1
#> b chr2 92-101 + | 2 0.888888888888889
#> c chr2 93-102 + | 3 0.777777777777778
#> j chr3 121-130 - | 10 0
#> -------
#> seqinfo: 3 sequences from an unspecified genome
# 取下游10bp(区分正负链)
flank(g, 10, start=FALSE)
#> GRanges object with 4 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> a chr1 91-100 - | 1 1
#> b chr2 113-122 + | 2 0.888888888888889
#> c chr2 114-123 + | 3 0.777777777777778
#> j chr3 100-109 - | 10 0
#> -------
#> seqinfo: 3 sequences from an unspecified genome
整体移动
# 全部移动5bp(不分正负链)
shift(g, 5)
#> GRanges object with 4 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> a chr1 106-116 - | 1 1
#> b chr2 107-117 + | 2 0.888888888888889
#> c chr2 108-118 + | 3 0.777777777777778
#> j chr3 115-125 - | 10 0
#> -------
#> seqinfo: 3 sequences from an unspecified genome
直接修改区间范围
# 全部修改为30bp(区分正负链;默认固定start)
resize(g, 30,fix="start")
#> GRanges object with 4 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> a chr1 82-111 - | 1 1
#> b chr2 102-131 + | 2 0.888888888888889
#> c chr2 103-132 + | 3 0.777777777777778
#> j chr3 91-120 - | 10 0
#> -------
#> seqinfo: 3 sequences from an unspecified genome
3.2 inter-range (同一个GRange中;行与行之间调整)
g
#> GRanges object with 4 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> a chr1 101-111 - | 1 1
#> b chr2 102-112 + | 2 0.888888888888889
#> c chr2 103-113 + | 3 0.777777777777778
#> j chr3 110-120 - | 10 0
#> -------
#> seqinfo: 3 sequences from an unspecified genome
将重叠区域合并
reduce(g)
#> GRanges object with 3 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> [1] chr1 101-111 -
#> [2] chr2 102-113 +
#> [3] chr3 110-120 -
#> -------
#> seqinfo: 3 sequences from an unspecified genome
得到没有任何重叠的结果
# 如果有重叠,则单独列出来
disjoin(g)
#> GRanges object with 5 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> [1] chr1 101-111 -
#> [2] chr2 102 +
#> [3] chr2 103-112 +
#> [4] chr2 113 +
#> [5] chr3 110-120 -
#> -------
#> seqinfo: 3 sequences from an unspecified genome
对没有任何重叠的结果数量进行统计
coverage(g)
#> RleList of length 3
#> $chr1
#> integer-Rle of length 249250621 with 3 runs
#> Lengths: 100 11 249250510
#> Values : 0 1 0
#>
#> $chr2
#> integer-Rle of length 243199373 with 5 runs
#> Lengths: 101 1 10 1 243199260
#> Values : 0 1 2 1 0
#>
#> $chr3
#> integer-Rle of length 198022430 with 3 runs
#> Lengths: 109 11 198022310
#> Values : 0 1 0
看GRange对象以外的区域
# 也就是每个染色体长度除去了g包含的坐标
gaps(g)
#> GRanges object with 12 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> [1] chr1 1-249250621 +
#> [2] chr1 1-100 -
#> [3] chr1 112-249250621 -
#> [4] chr1 1-249250621 *
#> [5] chr2 1-101 +
#> ... ... ... ...
#> [8] chr2 1-243199373 *
#> [9] chr3 1-198022430 +
#> [10] chr3 1-109 -
#> [11] chr3 121-198022430 -
#> [12] chr3 1-198022430 *
#> -------
#> seqinfo: 3 sequences from an unspecified genome
3.3 between-range (两个GRange对象之间)
重点操作是:findOverlaps
、union
、intersect
、setdiff
数据准备
g2 <- head(gr, n=2)
g
#> GRanges object with 4 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> a chr1 101-111 - | 1 1
#> b chr2 102-112 + | 2 0.888888888888889
#> c chr2 103-113 + | 3 0.777777777777778
#> j chr3 110-120 - | 10 0
#> -------
#> seqinfo: 3 sequences from an unspecified genome
g2
#> GRanges object with 2 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> a chr1 101-111 - | 1 1
#> b chr2 102-112 + | 2 0.888888888888889
#> -------
#> seqinfo: 3 sequences from an unspecified genome
并集
union(g, g2)
#> GRanges object with 3 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> [1] chr1 101-111 -
#> [2] chr2 102-113 +
#> [3] chr3 110-120 -
#> -------
#> seqinfo: 3 sequences from an unspecified genome
交集
intersect(g, g2)
#> GRanges object with 2 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> [1] chr1 101-111 -
#> [2] chr2 102-112 +
#> -------
#> seqinfo: 3 sequences from an unspecified genome
补集
setdiff(g, g2)
#> GRanges object with 2 ranges and 0 metadata columns:
#> seqnames ranges strand
#> <Rle> <IRanges> <Rle>
#> [1] chr2 113 +
#> [2] chr3 110-120 -
#> -------
#> seqinfo: 3 sequences from an unspecified genome
findOverlaps
数据一:https://raw.githubusercontent.com/Bioconductor/BiocWorkshops/master/100_Morgan_RBiocForAll/CpGislands.Hsapiens.hg38.UCSC.bed
suppressMessages(library("rtracklayer"))
library(stringr)
fname='CpGislands.Hsapiens.hg38.UCSC.bed.txt'
# 重命名,不需要后缀.txt
aft_name <- gsub('.txt','',fname)
file.rename(fname,aft_name)
#> Warning in file.rename(fname, aft_name): cannot rename file 'CpGislands.Hsapiens.hg38.UCSC.bed.txt' to
#> 'CpGislands.Hsapiens.hg38.UCSC.bed', reason 'No such file or directory'
#> [1] FALSE
cpg <- import(aft_name)
# 原本BED文件是0-based,半开半闭区间,比如[0,10) = 0..9十个数
# 而import()函数自动将BED的类型转成Bioconductor统一类型:1-based、全闭区间,也就是将[0,10)变成了[1,10],还是原来的十个数值
head(cpg)
#> GRanges object with 6 ranges and 1 metadata column:
#> seqnames ranges strand | name
#> <Rle> <IRanges> <Rle> | <character>
#> [1] chr1 155188537-155192004 * | CpG:_361
#> [2] chr1 2226774-2229734 * | CpG:_366
#> [3] chr1 36306230-36307408 * | CpG:_110
#> [4] chr1 47708823-47710847 * | CpG:_164
#> [5] chr1 53737730-53739637 * | CpG:_221
#> [6] chr1 144179072-144179313 * | CpG:_20
#> -------
#> seqinfo: 254 sequences from an unspecified genome; no seqlengths
seqnames(cpg)
#> factor-Rle of length 30477 with 254 runs
#> Lengths: 2535 1682 ... 1 2
#> Values : chr1 chr2 ... chr22_KI270735v1_random chr22_KI270738v1_random
#> Levels(254): chr1 chr2 chr3 chr4 ... chr22_KI270734v1_random chr22_KI270735v1_random chr22_KI270738v1_random
# 只对标准的染色体1-22+X+Y进行处理
cpg <- keepStandardChromosomes(cpg, pruning.mode = "coarse")
head( start(cpg) )
#> [1] 155188537 2226774 36306230 47708823 53737730 144179072
hist(log10(width(cpg)))
# 取子集
subset(cpg, seqnames %in% c("chr1", "chr2"))
#> GRanges object with 4217 ranges and 1 metadata column:
#> seqnames ranges strand | name
#> <Rle> <IRanges> <Rle> | <character>
#> [1] chr1 155188537-155192004 * | CpG:_361
#> [2] chr1 2226774-2229734 * | CpG:_366
#> [3] chr1 36306230-36307408 * | CpG:_110
#> [4] chr1 47708823-47710847 * | CpG:_164
#> [5] chr1 53737730-53739637 * | CpG:_221
#> ... ... ... ... . ...
#> [4213] chr2 242003256-242004412 * | CpG:_79
#> [4214] chr2 242006590-242010686 * | CpG:_263
#> [4215] chr2 242045491-242045723 * | CpG:_16
#> [4216] chr2 242046615-242047706 * | CpG:_170
#> [4217] chr2 242088150-242089411 * | CpG:_149
#> -------
#> seqinfo: 24 sequences from an unspecified genome; no seqlengths
数据二:TxDb.Hsapiens.UCSC.hg38.knownGene
library("TxDb.Hsapiens.UCSC.hg38.knownGene")
tx <- transcripts(TxDb.Hsapiens.UCSC.hg38.knownGene)
# 如果取外显子坐标:exons(TxDb.Hsapiens.UCSC.hg38.knownGene)
# 如果取基因坐标:genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
tx
#> GRanges object with 247541 ranges and 2 metadata columns:
#> seqnames ranges strand | tx_id tx_name
#> <Rle> <IRanges> <Rle> | <integer> <character>
#> [1] chr1 11869-14409 + | 1 ENST00000456328.2
#> [2] chr1 12010-13670 + | 2 ENST00000450305.2
#> [3] chr1 29554-31097 + | 3 ENST00000473358.1
#> [4] chr1 30267-31109 + | 4 ENST00000469289.1
#> [5] chr1 30366-30503 + | 5 ENST00000607096.1
#> ... ... ... ... . ... ...
#> [247537] chrUn_GL000220v1 155997-156149 + | 247537 ENST00000619779.1
#> [247538] chrUn_KI270442v1 380608-380726 + | 247538 ENST00000620265.1
#> [247539] chrUn_KI270442v1 217250-217401 - | 247539 ENST00000611690.1
#> [247540] chrUn_KI270744v1 51009-51114 - | 247540 ENST00000616830.1
#> [247541] chrUn_KI270750v1 148668-148843 + | 247541 ENST00000612925.1
#> -------
#> seqinfo: 595 sequences (1 circular) from hg38 genome
# 同样也是保留标准染色体信息
tx <- keepStandardChromosomes(tx, pruning.mode="coarse")
seqnames(tx)
#> factor-Rle of length 227462 with 25 runs
#> Lengths: 19915 16586 14251 9364 10786 10427 10798 9307 ... 4437 13623 5403 2925 4920 6970 978 37
#> Values : chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 ... chr18 chr19 chr20 chr21 chr22 chrX chrY chrM
#> Levels(25): chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 ... chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY chrM
开始取交集
findOverlaps(tx,cpg)
#> Hits object with 140077 hits and 0 metadata columns:
#> queryHits subjectHits
#> <integer> <integer>
#> [1] 3 10
#> [2] 32 18
#> [3] 33 18
#> [4] 34 18
#> [5] 35 18
#> ... ... ...
#> [140073] 227385 13824
#> [140074] 227386 13823
#> [140075] 227386 13824
#> [140076] 227419 13829
#> [140077] 227424 13831
#> -------
#> queryLength: 227462 / subjectLength: 27949
# 统计每个转录本上有多少CpG位点
olaps <- countOverlaps(tx, cpg)
length(olaps)
#> [1] 227462
table(olaps)
#> olaps
#> 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
#> 126181 81273 12621 3804 1557 713 427 266 175 96 68 42 43 33 25 22
#> 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
#> 20 7 14 7 7 3 7 6 3 1 4 1 3 1 6 5
#> 32 33 34 35 36 37 38 63 65 71
#> 3 3 1 2 3 2 1 4 1 1
# 添加结果到GRanges
tx$cpgOverlaps <- olaps
tx
#> GRanges object with 227462 ranges and 3 metadata columns:
#> seqnames ranges strand | tx_id tx_name cpgOverlaps
#> <Rle> <IRanges> <Rle> | <integer> <character> <integer>
#> [1] chr1 11869-14409 + | 1 ENST00000456328.2 0
#> [2] chr1 12010-13670 + | 2 ENST00000450305.2 0
#> [3] chr1 29554-31097 + | 3 ENST00000473358.1 1
#> [4] chr1 30267-31109 + | 4 ENST00000469289.1 0
#> [5] chr1 30366-30503 + | 5 ENST00000607096.1 0
#> ... ... ... ... . ... ... ...
#> [227458] chrM 5826-5891 - | 227458 ENST00000387409.1 0
#> [227459] chrM 7446-7514 - | 227459 ENST00000387416.2 0
#> [227460] chrM 14149-14673 - | 227460 ENST00000361681.2 0
#> [227461] chrM 14674-14742 - | 227461 ENST00000387459.1 0
#> [227462] chrM 15956-16023 - | 227462 ENST00000387461.2 0
#> -------
#> seqinfo: 25 sequences (1 circular) from hg38 genome
# 根据这个结果取子集
subset(tx, cpgOverlaps > 10)
#> GRanges object with 281 ranges and 3 metadata columns:
#> seqnames ranges strand | tx_id tx_name cpgOverlaps
#> <Rle> <IRanges> <Rle> | <integer> <character> <integer>
#> [1] chr1 2050411-2185395 + | 292 ENST00000378567.8 15
#> [2] chr1 2050485-2146108 + | 293 ENST00000468310.5 11
#> [3] chr1 2073462-2185390 + | 298 ENST00000400921.6 13
#> [4] chr1 2073986-2185190 + | 300 ENST00000461106.6 13
#> [5] chr1 3069168-3434342 + | 382 ENST00000511072.5 32
#> ... ... ... ... . ... ... ...
#> [277] chrX 40051246-40177329 - | 223668 ENST00000378455.8 11
#> [278] chrX 40051248-40177329 - | 223669 ENST00000342274.8 11
#> [279] chrX 40062955-40177083 - | 223675 ENST00000406200.3 11
#> [280] chrY 333933-386907 - | 226968 ENST00000390665.9 14
#> [281] chrY 344896-386955 - | 226974 ENST00000445792.7 12
#> -------
#> seqinfo: 25 sequences (1 circular) from hg38 genome
4 更复杂的GRangesList
多个GRanges对象放到一个列表组成了 GRangesList
4.1 构建
gr1 <- GRanges(
seqnames = "chr2",
ranges = IRanges(103, 106),
strand = "+",
score = 5L, GC = 0.45)
gr2 <- GRanges(
seqnames = c("chr1", "chr1"),
ranges = IRanges(c(107, 113), width = 3),
strand = c("+", "-"),
score = 3:4, GC = c(0.3, 0.5))
grl <- GRangesList("txA" = gr1, "txB" = gr2)
grl
#> GRangesList object of length 2:
#> $txA
#> GRanges object with 1 range and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> [1] chr2 103-106 + | 5 0.45
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
#>
#> $txB
#> GRanges object with 2 ranges and 2 metadata columns:
#> seqnames ranges strand | score GC
#> <Rle> <IRanges> <Rle> | <integer> <numeric>
#> [1] chr1 107-109 + | 3 0.3
#> [2] chr1 113-115 - | 4 0.5
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
4.2 基本操作
seqnames(grl)
#> RleList of length 2
#> $txA
#> factor-Rle of length 1 with 1 run
#> Lengths: 1
#> Values : chr2
#> Levels(2): chr2 chr1
#>
#> $txB
#> factor-Rle of length 2 with 1 run
#> Lengths: 2
#> Values : chr1
#> Levels(2): chr2 chr1
ranges(grl)
#> IRangesList object of length 2:
#> $txA
#> IRanges object with 1 range and 0 metadata columns:
#> start end width
#> <integer> <integer> <integer>
#> [1] 103 106 4
#>
#> $txB
#> IRanges object with 2 ranges and 0 metadata columns:
#> start end width
#> <integer> <integer> <integer>
#> [1] 107 109 3
#> [2] 113 115 3
strand(grl)
#> RleList of length 2
#> $txA
#> factor-Rle of length 1 with 1 run
#> Lengths: 1
#> Values : +
#> Levels(3): + - *
#>
#> $txB
#> factor-Rle of length 2 with 2 runs
#> Lengths: 1 1
#> Values : + -
#> Levels(3): + - *
length(grl)
#> [1] 2
names(grl)
#> [1] "txA" "txB"
seqlengths(grl)
#> chr2 chr1
#> NA NA
isEmpty(grl)
#> [1] FALSE
# 提取meta信息,需要先unlist
mcols(grl) #没结果
#> DataFrame with 2 rows and 0 columns
mcols(unlist(grl))
#> DataFrame with 3 rows and 2 columns
#> score GC
#> <integer> <numeric>
#> txA 5 0.45
#> txB 3 0.3
#> txB 4 0.5
4.3 列表的循环操作
lapply(grl, length)
#> $txA
#> [1] 1
#>
#> $txB
#> [1] 2
sapply(grl, length)
#> txA txB
#> 1 2
还可以先unlist,计算完重新list
gr <- unlist(grl)
gr$log_score <- log(gr$score)
grl <- relist(gr, grl)
grl
#> GRangesList object of length 2:
#> $txA
#> GRanges object with 1 range and 3 metadata columns:
#> seqnames ranges strand | score GC log_score
#> <Rle> <IRanges> <Rle> | <integer> <numeric> <numeric>
#> txA chr2 103-106 + | 5 0.45 1.6094379124341
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths
#>
#> $txB
#> GRanges object with 2 ranges and 3 metadata columns:
#> seqnames ranges strand | score GC log_score
#> <Rle> <IRanges> <Rle> | <integer> <numeric> <numeric>
#> txB chr1 107-109 + | 3 0.3 1.09861228866811
#> txB chr1 113-115 - | 4 0.5 1.38629436111989
#> -------
#> seqinfo: 2 sequences from an unspecified genome; no seqlengths