201-跟着官方文档来看看GRanges这个对象

Aug 5, 2020 11 min read cnposts

刘小泽写于2020.8.5 今天跟着官方文档来看看GRanges这个对象

1 前言

内容来自：https://bioconductor.org/packages/release/bioc/vignettes/GenomicRanges/inst/doc/GenomicRangesIntroduction.html

GenomicRanges是Bioconductor各个项目都在使用的基因组坐标的存储方式，它基于IRanges 建立，目前为BSgenome、Rsamtools、ShortRead 、rtracklayer、GenomicFeatures、 GenomicAlignments、VariantAnnotation 等提供支持

安装加载

if (!require("BiocManager"))install.packages("BiocManager")
if (!require("GenomicRanges"))BiocManager::install("GenomicRanges")

library(GenomicRanges)

2 GRanges: Genomic Ranges

存储了一系列的基因组起始、终止坐标，可以为连续的结合位点、转录本、外显子等提供支持

gr <- GRanges(
    seqnames = Rle(c("chr1", "chr2", "chr1", "chr3"), c(1, 3, 2, 4)),
    ranges = IRanges(101:110, end = 111:120, names = head(letters, 10)),
    strand = Rle(strand(c("-", "+", "*", "+", "-")), c(1, 2, 2, 3, 2)),
    score = 1:10,
    GC = seq(1, 0, length=10))
gr
#> GRanges object with 10 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   a     chr1   101-111      - |         1                 1
#>   b     chr2   102-112      + |         2 0.888888888888889
#>   c     chr2   103-113      + |         3 0.777777777777778
#>   d     chr2   104-114      * |         4 0.666666666666667
#>   e     chr1   105-115      * |         5 0.555555555555556
#>   f     chr1   106-116      + |         6 0.444444444444444
#>   g     chr3   107-117      + |         7 0.333333333333333
#>   h     chr3   108-118      + |         8 0.222222222222222
#>   i     chr3   109-119      - |         9 0.111111111111111
#>   j     chr3   110-120      - |        10                 0
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome; no seqlengths

左边是基因组坐标（染色体名称、范围、正负链），右边是meta信息（score、GC等等）

2.1 认识这个数据

其中包含多少行

names(gr)
#>  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"
# 或者
length(gr)
#> [1] 10

分别提取seqnames、ranges、strand

seqnames(gr)
#> factor-Rle of length 10 with 4 runs
#>   Lengths:    1    3    2    4
#>   Values : chr1 chr2 chr1 chr3
#> Levels(3): chr1 chr2 chr3

ranges(gr)
#> IRanges object with 10 ranges and 0 metadata columns:
#>         start       end     width
#>     <integer> <integer> <integer>
#>   a       101       111        11
#>   b       102       112        11
#>   c       103       113        11
#>   d       104       114        11
#>   e       105       115        11
#>   f       106       116        11
#>   g       107       117        11
#>   h       108       118        11
#>   i       109       119        11
#>   j       110       120        11

strand(gr)
#> factor-Rle of length 10 with 5 runs
#>   Lengths: 1 2 2 3 2
#>   Values : - + * + -
#> Levels(3): + - *

一次性提取seqnames、ranges、strand

granges(gr)
#> GRanges object with 10 ranges and 0 metadata columns:
#>     seqnames    ranges strand
#>        <Rle> <IRanges>  <Rle>
#>   a     chr1   101-111      -
#>   b     chr2   102-112      +
#>   c     chr2   103-113      +
#>   d     chr2   104-114      *
#>   e     chr1   105-115      *
#>   f     chr1   106-116      +
#>   g     chr3   107-117      +
#>   h     chr3   108-118      +
#>   i     chr3   109-119      -
#>   j     chr3   110-120      -
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome; no seqlengths

一次性提取meta信息

得到一个数据框

mcols(gr)
#> DataFrame with 10 rows and 2 columns
#>       score                GC
#>   <integer>         <numeric>
#> a         1                 1
#> b         2 0.888888888888889
#> c         3 0.777777777777778
#> d         4 0.666666666666667
#> e         5 0.555555555555556
#> f         6 0.444444444444444
#> g         7 0.333333333333333
#> h         8 0.222222222222222
#> i         9 0.111111111111111
#> j        10                 0

如果只想提取其中的某部分

mcols(gr)$score
#>  [1]  1  2  3  4  5  6  7  8  9 10

还可以加入染色体长度

seqlengths(gr) <- c(249250621, 243199373, 198022430)
# 提取
seqlengths(gr)
#>      chr1      chr2      chr3 
#> 249250621 243199373 198022430

2.2 拆分+取子集

拆分

利用split会得到一个GRangesList对象

sp <- split(gr, rep(1:2, each=5))
sp
#> GRangesList object of length 2:
#> $`1`
#> GRanges object with 5 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   a     chr1   101-111      - |         1                 1
#>   b     chr2   102-112      + |         2 0.888888888888889
#>   c     chr2   103-113      + |         3 0.777777777777778
#>   d     chr2   104-114      * |         4 0.666666666666667
#>   e     chr1   105-115      * |         5 0.555555555555556
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome
#> 
#> $`2`
#> GRanges object with 5 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   f     chr1   106-116      + |         6 0.444444444444444
#>   g     chr3   107-117      + |         7 0.333333333333333
#>   h     chr3   108-118      + |         8 0.222222222222222
#>   i     chr3   109-119      - |         9 0.111111111111111
#>   j     chr3   110-120      - |        10                 0
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

如果要再连起来

c(sp[[1]], sp[[2]])
#> GRanges object with 10 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   a     chr1   101-111      - |         1                 1
#>   b     chr2   102-112      + |         2 0.888888888888889
#>   c     chr2   103-113      + |         3 0.777777777777778
#>   d     chr2   104-114      * |         4 0.666666666666667
#>   e     chr1   105-115      * |         5 0.555555555555556
#>   f     chr1   106-116      + |         6 0.444444444444444
#>   g     chr3   107-117      + |         7 0.333333333333333
#>   h     chr3   108-118      + |         8 0.222222222222222
#>   i     chr3   109-119      - |         9 0.111111111111111
#>   j     chr3   110-120      - |        10                 0
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

简单取子集

操作和向量的操作类似

gr[2:3]
#> GRanges object with 2 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   b     chr2   102-112      + |         2 0.888888888888889
#>   c     chr2   103-113      + |         3 0.777777777777778
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

如果要再选取一些meta信息，操作又和数据框类似

gr[2:3, "GC"]
#> GRanges object with 2 ranges and 1 metadata column:
#>     seqnames    ranges strand |                GC
#>        <Rle> <IRanges>  <Rle> |         <numeric>
#>   b     chr2   102-112      + | 0.888888888888889
#>   c     chr2   103-113      + | 0.777777777777778
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

对子集重新赋值

# 先把全部的行拆成一个列表，然后把第一行赋值给第二行
singles <- split(gr, names(gr))
grMod <- gr
grMod[2] <- singles[[1]]
head(grMod, n=3)
#> GRanges object with 3 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   a     chr1   101-111      - |         1                 1
#>   b     chr1   101-111      - |         1                 1
#>   c     chr2   103-113      + |         3 0.777777777777778
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

还有重复、翻转、取特定区域等操作

# 重复
rep(singles[[2]], times = 3)
#> GRanges object with 3 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   b     chr2   102-112      + |         2 0.888888888888889
#>   b     chr2   102-112      + |         2 0.888888888888889
#>   b     chr2   102-112      + |         2 0.888888888888889
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome
# 翻转
rev(gr)
#> GRanges object with 10 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   j     chr3   110-120      - |        10                 0
#>   i     chr3   109-119      - |         9 0.111111111111111
#>   h     chr3   108-118      + |         8 0.222222222222222
#>   g     chr3   107-117      + |         7 0.333333333333333
#>   f     chr1   106-116      + |         6 0.444444444444444
#>   e     chr1   105-115      * |         5 0.555555555555556
#>   d     chr2   104-114      * |         4 0.666666666666667
#>   c     chr2   103-113      + |         3 0.777777777777778
#>   b     chr2   102-112      + |         2 0.888888888888889
#>   a     chr1   101-111      - |         1                 1
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome
# 前后两行
head(gr,n=2)
#> GRanges object with 2 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   a     chr1   101-111      - |         1                 1
#>   b     chr2   102-112      + |         2 0.888888888888889
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome
tail(gr,n=2)
#> GRanges object with 2 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   i     chr3   109-119      - |         9 0.111111111111111
#>   j     chr3   110-120      - |        10                 0
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome
# 取第2-4行
window(gr, start=2,end=4)
#> GRanges object with 3 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   b     chr2   102-112      + |         2 0.888888888888889
#>   c     chr2   103-113      + |         3 0.777777777777778
#>   d     chr2   104-114      * |         4 0.666666666666667
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome
# 取2-3行 + 7-9行
gr[IRanges(start=c(2,7), end=c(3,9))]
#> GRanges object with 5 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   b     chr2   102-112      + |         2 0.888888888888889
#>   c     chr2   103-113      + |         3 0.777777777777778
#>   g     chr3   107-117      + |         7 0.333333333333333
#>   h     chr3   108-118      + |         8 0.222222222222222
#>   i     chr3   109-119      - |         9 0.111111111111111
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

3 对区间操作

3.1 intra-range（同一个GRange中；行内部调整）

获取数据

g <- gr[1:3]
g <- append(g, singles[[10]])
start(g)
#> [1] 101 102 103 110
end(g)
#> [1] 111 112 113 120
width(g)
#> [1] 11 11 11 11
# 如果存在两行包含的关系，range就会合并显示
range(g)
#> GRanges object with 3 ranges and 0 metadata columns:
#>       seqnames    ranges strand
#>          <Rle> <IRanges>  <Rle>
#>   [1]     chr1   101-111      -
#>   [2]     chr2   102-113      +
#>   [3]     chr3   110-120      -
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome
g
#> GRanges object with 4 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   a     chr1   101-111      - |         1                 1
#>   b     chr2   102-112      + |         2 0.888888888888889
#>   c     chr2   103-113      + |         3 0.777777777777778
#>   j     chr3   110-120      - |        10                 0
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

对侧翼操作

# 取上游10bp（区分正负链）
flank(g, 10)
#> GRanges object with 4 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   a     chr1   112-121      - |         1                 1
#>   b     chr2    92-101      + |         2 0.888888888888889
#>   c     chr2    93-102      + |         3 0.777777777777778
#>   j     chr3   121-130      - |        10                 0
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome
# 取下游10bp（区分正负链）
flank(g, 10, start=FALSE)
#> GRanges object with 4 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   a     chr1    91-100      - |         1                 1
#>   b     chr2   113-122      + |         2 0.888888888888889
#>   c     chr2   114-123      + |         3 0.777777777777778
#>   j     chr3   100-109      - |        10                 0
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

整体移动

# 全部移动5bp（不分正负链）
shift(g, 5)
#> GRanges object with 4 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   a     chr1   106-116      - |         1                 1
#>   b     chr2   107-117      + |         2 0.888888888888889
#>   c     chr2   108-118      + |         3 0.777777777777778
#>   j     chr3   115-125      - |        10                 0
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

直接修改区间范围

# 全部修改为30bp（区分正负链；默认固定start）
resize(g, 30,fix="start")
#> GRanges object with 4 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   a     chr1    82-111      - |         1                 1
#>   b     chr2   102-131      + |         2 0.888888888888889
#>   c     chr2   103-132      + |         3 0.777777777777778
#>   j     chr3    91-120      - |        10                 0
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

3.2 inter-range （同一个GRange中；行与行之间调整）

g
#> GRanges object with 4 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   a     chr1   101-111      - |         1                 1
#>   b     chr2   102-112      + |         2 0.888888888888889
#>   c     chr2   103-113      + |         3 0.777777777777778
#>   j     chr3   110-120      - |        10                 0
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

将重叠区域合并

reduce(g)
#> GRanges object with 3 ranges and 0 metadata columns:
#>       seqnames    ranges strand
#>          <Rle> <IRanges>  <Rle>
#>   [1]     chr1   101-111      -
#>   [2]     chr2   102-113      +
#>   [3]     chr3   110-120      -
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

得到没有任何重叠的结果

# 如果有重叠，则单独列出来
disjoin(g)
#> GRanges object with 5 ranges and 0 metadata columns:
#>       seqnames    ranges strand
#>          <Rle> <IRanges>  <Rle>
#>   [1]     chr1   101-111      -
#>   [2]     chr2       102      +
#>   [3]     chr2   103-112      +
#>   [4]     chr2       113      +
#>   [5]     chr3   110-120      -
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

对没有任何重叠的结果数量进行统计

coverage(g)
#> RleList of length 3
#> $chr1
#> integer-Rle of length 249250621 with 3 runs
#>   Lengths:       100        11 249250510
#>   Values :         0         1         0
#> 
#> $chr2
#> integer-Rle of length 243199373 with 5 runs
#>   Lengths:       101         1        10         1 243199260
#>   Values :         0         1         2         1         0
#> 
#> $chr3
#> integer-Rle of length 198022430 with 3 runs
#>   Lengths:       109        11 198022310
#>   Values :         0         1         0

看GRange对象以外的区域

# 也就是每个染色体长度除去了g包含的坐标
gaps(g)
#> GRanges object with 12 ranges and 0 metadata columns:
#>        seqnames        ranges strand
#>           <Rle>     <IRanges>  <Rle>
#>    [1]     chr1   1-249250621      +
#>    [2]     chr1         1-100      -
#>    [3]     chr1 112-249250621      -
#>    [4]     chr1   1-249250621      *
#>    [5]     chr2         1-101      +
#>    ...      ...           ...    ...
#>    [8]     chr2   1-243199373      *
#>    [9]     chr3   1-198022430      +
#>   [10]     chr3         1-109      -
#>   [11]     chr3 121-198022430      -
#>   [12]     chr3   1-198022430      *
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

3.3 between-range （两个GRange对象之间）

重点操作是：findOverlaps、union、intersect、setdiff

数据准备

g2 <- head(gr, n=2)
g
#> GRanges object with 4 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   a     chr1   101-111      - |         1                 1
#>   b     chr2   102-112      + |         2 0.888888888888889
#>   c     chr2   103-113      + |         3 0.777777777777778
#>   j     chr3   110-120      - |        10                 0
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome
g2
#> GRanges object with 2 ranges and 2 metadata columns:
#>     seqnames    ranges strand |     score                GC
#>        <Rle> <IRanges>  <Rle> | <integer>         <numeric>
#>   a     chr1   101-111      - |         1                 1
#>   b     chr2   102-112      + |         2 0.888888888888889
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

并集

union(g, g2)
#> GRanges object with 3 ranges and 0 metadata columns:
#>       seqnames    ranges strand
#>          <Rle> <IRanges>  <Rle>
#>   [1]     chr1   101-111      -
#>   [2]     chr2   102-113      +
#>   [3]     chr3   110-120      -
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

交集

intersect(g, g2)
#> GRanges object with 2 ranges and 0 metadata columns:
#>       seqnames    ranges strand
#>          <Rle> <IRanges>  <Rle>
#>   [1]     chr1   101-111      -
#>   [2]     chr2   102-112      +
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

补集

setdiff(g, g2)
#> GRanges object with 2 ranges and 0 metadata columns:
#>       seqnames    ranges strand
#>          <Rle> <IRanges>  <Rle>
#>   [1]     chr2       113      +
#>   [2]     chr3   110-120      -
#>   -------
#>   seqinfo: 3 sequences from an unspecified genome

findOverlaps

数据一：https://raw.githubusercontent.com/Bioconductor/BiocWorkshops/master/100_Morgan_RBiocForAll/CpGislands.Hsapiens.hg38.UCSC.bed

suppressMessages(library("rtracklayer"))
library(stringr)
fname='CpGislands.Hsapiens.hg38.UCSC.bed.txt'
# 重命名，不需要后缀.txt
aft_name <- gsub('.txt','',fname)
file.rename(fname,aft_name)
#> Warning in file.rename(fname, aft_name): cannot rename file 'CpGislands.Hsapiens.hg38.UCSC.bed.txt' to
#> 'CpGislands.Hsapiens.hg38.UCSC.bed', reason 'No such file or directory'
#> [1] FALSE
cpg <- import(aft_name)
# 原本BED文件是0-based，半开半闭区间，比如[0,10) = 0..9十个数
# 而import()函数自动将BED的类型转成Bioconductor统一类型：1-based、全闭区间，也就是将[0,10)变成了[1,10]，还是原来的十个数值
head(cpg)
#> GRanges object with 6 ranges and 1 metadata column:
#>       seqnames              ranges strand |        name
#>          <Rle>           <IRanges>  <Rle> | <character>
#>   [1]     chr1 155188537-155192004      * |    CpG:_361
#>   [2]     chr1     2226774-2229734      * |    CpG:_366
#>   [3]     chr1   36306230-36307408      * |    CpG:_110
#>   [4]     chr1   47708823-47710847      * |    CpG:_164
#>   [5]     chr1   53737730-53739637      * |    CpG:_221
#>   [6]     chr1 144179072-144179313      * |     CpG:_20
#>   -------
#>   seqinfo: 254 sequences from an unspecified genome; no seqlengths
seqnames(cpg)
#> factor-Rle of length 30477 with 254 runs
#>   Lengths:                    2535                    1682 ...                       1                       2
#>   Values :                    chr1                    chr2 ... chr22_KI270735v1_random chr22_KI270738v1_random
#> Levels(254): chr1 chr2 chr3 chr4 ... chr22_KI270734v1_random chr22_KI270735v1_random chr22_KI270738v1_random
# 只对标准的染色体1-22+X+Y进行处理
cpg <- keepStandardChromosomes(cpg, pruning.mode = "coarse")
head( start(cpg) )
#> [1] 155188537   2226774  36306230  47708823  53737730 144179072
hist(log10(width(cpg)))

# 取子集
subset(cpg, seqnames %in% c("chr1", "chr2"))
#> GRanges object with 4217 ranges and 1 metadata column:
#>          seqnames              ranges strand |        name
#>             <Rle>           <IRanges>  <Rle> | <character>
#>      [1]     chr1 155188537-155192004      * |    CpG:_361
#>      [2]     chr1     2226774-2229734      * |    CpG:_366
#>      [3]     chr1   36306230-36307408      * |    CpG:_110
#>      [4]     chr1   47708823-47710847      * |    CpG:_164
#>      [5]     chr1   53737730-53739637      * |    CpG:_221
#>      ...      ...                 ...    ... .         ...
#>   [4213]     chr2 242003256-242004412      * |     CpG:_79
#>   [4214]     chr2 242006590-242010686      * |    CpG:_263
#>   [4215]     chr2 242045491-242045723      * |     CpG:_16
#>   [4216]     chr2 242046615-242047706      * |    CpG:_170
#>   [4217]     chr2 242088150-242089411      * |    CpG:_149
#>   -------
#>   seqinfo: 24 sequences from an unspecified genome; no seqlengths

数据二：TxDb.Hsapiens.UCSC.hg38.knownGene

library("TxDb.Hsapiens.UCSC.hg38.knownGene")
tx <- transcripts(TxDb.Hsapiens.UCSC.hg38.knownGene)
# 如果取外显子坐标：exons(TxDb.Hsapiens.UCSC.hg38.knownGene)
# 如果取基因坐标：genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
tx
#> GRanges object with 247541 ranges and 2 metadata columns:
#>                    seqnames        ranges strand |     tx_id           tx_name
#>                       <Rle>     <IRanges>  <Rle> | <integer>       <character>
#>        [1]             chr1   11869-14409      + |         1 ENST00000456328.2
#>        [2]             chr1   12010-13670      + |         2 ENST00000450305.2
#>        [3]             chr1   29554-31097      + |         3 ENST00000473358.1
#>        [4]             chr1   30267-31109      + |         4 ENST00000469289.1
#>        [5]             chr1   30366-30503      + |         5 ENST00000607096.1
#>        ...              ...           ...    ... .       ...               ...
#>   [247537] chrUn_GL000220v1 155997-156149      + |    247537 ENST00000619779.1
#>   [247538] chrUn_KI270442v1 380608-380726      + |    247538 ENST00000620265.1
#>   [247539] chrUn_KI270442v1 217250-217401      - |    247539 ENST00000611690.1
#>   [247540] chrUn_KI270744v1   51009-51114      - |    247540 ENST00000616830.1
#>   [247541] chrUn_KI270750v1 148668-148843      + |    247541 ENST00000612925.1
#>   -------
#>   seqinfo: 595 sequences (1 circular) from hg38 genome

# 同样也是保留标准染色体信息
tx <- keepStandardChromosomes(tx, pruning.mode="coarse")
seqnames(tx)
#> factor-Rle of length 227462 with 25 runs
#>   Lengths: 19915 16586 14251  9364 10786 10427 10798  9307 ...  4437 13623  5403  2925  4920  6970   978    37
#>   Values :  chr1  chr2  chr3  chr4  chr5  chr6  chr7  chr8 ... chr18 chr19 chr20 chr21 chr22  chrX  chrY  chrM
#> Levels(25): chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 ... chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY chrM

开始取交集

findOverlaps(tx,cpg)
#> Hits object with 140077 hits and 0 metadata columns:
#>            queryHits subjectHits
#>            <integer>   <integer>
#>        [1]         3          10
#>        [2]        32          18
#>        [3]        33          18
#>        [4]        34          18
#>        [5]        35          18
#>        ...       ...         ...
#>   [140073]    227385       13824
#>   [140074]    227386       13823
#>   [140075]    227386       13824
#>   [140076]    227419       13829
#>   [140077]    227424       13831
#>   -------
#>   queryLength: 227462 / subjectLength: 27949

# 统计每个转录本上有多少CpG位点
olaps <- countOverlaps(tx, cpg)
length(olaps)
#> [1] 227462
table(olaps)
#> olaps
#>      0      1      2      3      4      5      6      7      8      9     10     11     12     13     14     15 
#> 126181  81273  12621   3804   1557    713    427    266    175     96     68     42     43     33     25     22 
#>     16     17     18     19     20     21     22     23     24     25     26     27     28     29     30     31 
#>     20      7     14      7      7      3      7      6      3      1      4      1      3      1      6      5 
#>     32     33     34     35     36     37     38     63     65     71 
#>      3      3      1      2      3      2      1      4      1      1

# 添加结果到GRanges
tx$cpgOverlaps <- olaps
tx
#> GRanges object with 227462 ranges and 3 metadata columns:
#>            seqnames      ranges strand |     tx_id           tx_name cpgOverlaps
#>               <Rle>   <IRanges>  <Rle> | <integer>       <character>   <integer>
#>        [1]     chr1 11869-14409      + |         1 ENST00000456328.2           0
#>        [2]     chr1 12010-13670      + |         2 ENST00000450305.2           0
#>        [3]     chr1 29554-31097      + |         3 ENST00000473358.1           1
#>        [4]     chr1 30267-31109      + |         4 ENST00000469289.1           0
#>        [5]     chr1 30366-30503      + |         5 ENST00000607096.1           0
#>        ...      ...         ...    ... .       ...               ...         ...
#>   [227458]     chrM   5826-5891      - |    227458 ENST00000387409.1           0
#>   [227459]     chrM   7446-7514      - |    227459 ENST00000387416.2           0
#>   [227460]     chrM 14149-14673      - |    227460 ENST00000361681.2           0
#>   [227461]     chrM 14674-14742      - |    227461 ENST00000387459.1           0
#>   [227462]     chrM 15956-16023      - |    227462 ENST00000387461.2           0
#>   -------
#>   seqinfo: 25 sequences (1 circular) from hg38 genome
# 根据这个结果取子集
subset(tx, cpgOverlaps > 10)
#> GRanges object with 281 ranges and 3 metadata columns:
#>         seqnames            ranges strand |     tx_id           tx_name cpgOverlaps
#>            <Rle>         <IRanges>  <Rle> | <integer>       <character>   <integer>
#>     [1]     chr1   2050411-2185395      + |       292 ENST00000378567.8          15
#>     [2]     chr1   2050485-2146108      + |       293 ENST00000468310.5          11
#>     [3]     chr1   2073462-2185390      + |       298 ENST00000400921.6          13
#>     [4]     chr1   2073986-2185190      + |       300 ENST00000461106.6          13
#>     [5]     chr1   3069168-3434342      + |       382 ENST00000511072.5          32
#>     ...      ...               ...    ... .       ...               ...         ...
#>   [277]     chrX 40051246-40177329      - |    223668 ENST00000378455.8          11
#>   [278]     chrX 40051248-40177329      - |    223669 ENST00000342274.8          11
#>   [279]     chrX 40062955-40177083      - |    223675 ENST00000406200.3          11
#>   [280]     chrY     333933-386907      - |    226968 ENST00000390665.9          14
#>   [281]     chrY     344896-386955      - |    226974 ENST00000445792.7          12
#>   -------
#>   seqinfo: 25 sequences (1 circular) from hg38 genome

4 更复杂的GRangesList

多个GRanges对象放到一个列表组成了 GRangesList

4.1 构建

gr1 <- GRanges(
    seqnames = "chr2",
    ranges = IRanges(103, 106),
    strand = "+",
    score = 5L, GC = 0.45)
gr2 <- GRanges(
    seqnames = c("chr1", "chr1"),
    ranges = IRanges(c(107, 113), width = 3),
    strand = c("+", "-"),
    score = 3:4, GC = c(0.3, 0.5))
grl <- GRangesList("txA" = gr1, "txB" = gr2)
grl
#> GRangesList object of length 2:
#> $txA
#> GRanges object with 1 range and 2 metadata columns:
#>       seqnames    ranges strand |     score        GC
#>          <Rle> <IRanges>  <Rle> | <integer> <numeric>
#>   [1]     chr2   103-106      + |         5      0.45
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $txB
#> GRanges object with 2 ranges and 2 metadata columns:
#>       seqnames    ranges strand |     score        GC
#>          <Rle> <IRanges>  <Rle> | <integer> <numeric>
#>   [1]     chr1   107-109      + |         3       0.3
#>   [2]     chr1   113-115      - |         4       0.5
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths

4.2 基本操作

seqnames(grl)
#> RleList of length 2
#> $txA
#> factor-Rle of length 1 with 1 run
#>   Lengths:    1
#>   Values : chr2
#> Levels(2): chr2 chr1
#> 
#> $txB
#> factor-Rle of length 2 with 1 run
#>   Lengths:    2
#>   Values : chr1
#> Levels(2): chr2 chr1
ranges(grl)
#> IRangesList object of length 2:
#> $txA
#> IRanges object with 1 range and 0 metadata columns:
#>           start       end     width
#>       <integer> <integer> <integer>
#>   [1]       103       106         4
#> 
#> $txB
#> IRanges object with 2 ranges and 0 metadata columns:
#>           start       end     width
#>       <integer> <integer> <integer>
#>   [1]       107       109         3
#>   [2]       113       115         3
strand(grl)
#> RleList of length 2
#> $txA
#> factor-Rle of length 1 with 1 run
#>   Lengths: 1
#>   Values : +
#> Levels(3): + - *
#> 
#> $txB
#> factor-Rle of length 2 with 2 runs
#>   Lengths: 1 1
#>   Values : + -
#> Levels(3): + - *
length(grl)
#> [1] 2
names(grl)
#> [1] "txA" "txB"
seqlengths(grl)
#> chr2 chr1 
#>   NA   NA
isEmpty(grl)
#> [1] FALSE

# 提取meta信息，需要先unlist
mcols(grl) #没结果
#> DataFrame with 2 rows and 0 columns
mcols(unlist(grl))
#> DataFrame with 3 rows and 2 columns
#>         score        GC
#>     <integer> <numeric>
#> txA         5      0.45
#> txB         3       0.3
#> txB         4       0.5

4.3 列表的循环操作

lapply(grl, length)
#> $txA
#> [1] 1
#> 
#> $txB
#> [1] 2
sapply(grl, length)
#> txA txB 
#>   1   2

还可以先unlist，计算完重新list

gr <- unlist(grl)
gr$log_score <- log(gr$score)
grl <- relist(gr, grl)
grl
#> GRangesList object of length 2:
#> $txA
#> GRanges object with 1 range and 3 metadata columns:
#>       seqnames    ranges strand |     score        GC       log_score
#>          <Rle> <IRanges>  <Rle> | <integer> <numeric>       <numeric>
#>   txA     chr2   103-106      + |         5      0.45 1.6094379124341
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths
#> 
#> $txB
#> GRanges object with 2 ranges and 3 metadata columns:
#>       seqnames    ranges strand |     score        GC        log_score
#>          <Rle> <IRanges>  <Rle> | <integer> <numeric>        <numeric>
#>   txB     chr1   107-109      + |         3       0.3 1.09861228866811
#>   txB     chr1   113-115      - |         4       0.5 1.38629436111989
#>   -------
#>   seqinfo: 2 sequences from an unspecified genome; no seqlengths

R Bioconductor