scRNA-单细胞转录组学习笔记-19-使用monocle2分析文章数据
刘小泽写于19.9.5-第三单元第十一讲:使用monocle2分析文章数据 笔记目的:根据生信技能树的单细胞转录组课程探索smart-seq2技术相关的分析技术 课程链接在:http://jm.grazy.cn/index/mulitcourse/detail.html?cid=53
载入数据,创建对象
rm(list = ls())
Sys.setenv(R_MAX_NUM_DLLS=999)
## 首先载入文章的数据
load(file='../input.Rdata')
# 原始表达矩阵
counts=a
counts[1:4,1:4];dim(counts) # 2万多个基因,768个细胞(需要下一步进行过滤)
library(stringr)
# 样本信息
meta=df
> head(meta)
g plate n_g all
SS2_15_0048_A3 1 0048 2624 all
SS2_15_0048_A6 1 0048 2664 all
SS2_15_0048_A5 2 0048 3319 all
SS2_15_0048_A4 3 0048 4447 all
SS2_15_0048_A1 2 0048 4725 all
SS2_15_0048_A2 3 0048 5263 all
# 按行/基因检查:每个基因在多少细胞中有表达量
fivenum(apply(counts,1,function(x) sum(x>0) ))
boxplot(apply(counts,1,function(x) sum(x>0) ))
# 按列/样本检查:每个细胞中存在多少表达的基因
fivenum(apply(counts,2,function(x) sum(x>0) ))
hist(apply(counts,2,function(x) sum(x>0) ))
左图显示了:有50%的基因只在低于20个细胞中有表达量,还有许多没有表达量的:
> table(apply(counts,1,function(x) sum(x>0) )==0)
FALSE TRUE
17429 7153
# 存在7000多个基因在任何一个细胞中都没表达
右图显示了:大部分细胞都包含2000个以上的有表达的基因
开始使用newCellDataSet()
构建monocle对象:
# ---------首先构建基因的注释信息(feature_data)-----------
gene_ann <- data.frame(
gene_short_name = row.names(counts),
row.names = row.names(counts)
)
fd <- new("AnnotatedDataFrame",
data=gene_ann)
# ---------然后构建样本的注释信息(sample_data)---------
sample_ann=meta
pd <- new("AnnotatedDataFrame",
data=sample_ann)
# ---------开始构建对象---------
sc_cds <- newCellDataSet(
as.matrix(counts),
phenoData = pd,
featureData =fd,
expressionFamily = negbinomial.size(),
lowerDetectionLimit=1)
sc_cds
# CellDataSet (storageMode: environment)
# assayData: 24582 features, 768 samples
# element names: exprs
# protocolData: none
# phenoData
# sampleNames: SS2_15_0048_A3 SS2_15_0048_A6 ... SS2_15_0049_P24 (768 total)
# varLabels: g plate ... Size_Factor (5 total)
# varMetadata: labelDescription
# featureData
# featureNames: 0610005C13Rik 0610007P14Rik ... ERCC-00171 (24582 total)
# fvarLabels: gene_short_name
# fvarMetadata: labelDescription
# experimentData: use 'experimentData(object)'
# Annotation:
接下来质控过滤
=> detectGenes()
cds=sc_cds
cds
## 起初是 24582 features, 768 samples
#---------首先是对基因的过滤-------------
cds <- detectGenes(cds, min_expr = 0.1)
print(head(cds@featureData@data))
expressed_genes <- row.names(subset(cds@featureData@data,
num_cells_expressed >= 5))
length(expressed_genes)
# 14442
# 这里需要去掉ERCC基因
# 去掉ERCC基因
is.ercc <- grepl("ERCC",expressed_genes)
length(expressed_genes[!is.ercc])
# 14362(看到去掉了80个ERCC)
cds <- cds[expressed_genes[!is.ercc],]
cds
# 过滤基因后是 14362 features, 768 samples
#---------然后是对细胞的过滤-------------
# 如果不支持使用pData()函数,可以使用cds@phenoData@data来获得各种细胞注释信息
cell_anno <- cds@phenoData@data
> head(cell_anno)
g plate n_g all Size_Factor num_genes_expressed
SS2_15_0048_A3 1 0048 2624 all 0.6693919 3069
SS2_15_0048_A6 1 0048 2664 all 0.6820532 3040
SS2_15_0048_A5 2 0048 3319 all 1.0759418 3743
SS2_15_0048_A4 3 0048 4447 all 0.7294812 5014
SS2_15_0048_A1 2 0048 4725 all 1.5658507 5128
SS2_15_0048_A2 3 0048 5263 all 1.6187569 5693
# 这里简单过滤细胞
valid_cells <- row.names(cell_anno[cell_anno$num_genes_expressed>2000,] )
cds <- cds[,valid_cells]
cds
# 最后剩下:14362 features, 693 samples
然后进行归一化
=> estimateSizeFactors()
library(dplyr)
colnames(phenoData(cds)@data)
## 必要的归一化
cds <- estimateSizeFactors(cds)
cds <- estimateDispersions(cds)
然后降维聚类
step1:dispersionTable()
disp_table <- dispersionTable(cds)
unsup_clustering_genes <- subset(disp_table,
mean_expression >= 0.1)
cds <- setOrderingFilter(cds, unsup_clustering_genes$gene_id)
cds
plot_ordering_genes(cds)
# 图中黑色的点就是被标记出来一会要进行聚类的基因
step2:plot_pc_variance_explained()
plot_pc_variance_explained(cds, return_all = F) # norm_method='log'
step3:
关于聚类:monocle2一共有三种方法:densityPeak", "louvain", "DDRTree"
默认使用density peak
的方法,但是对于大型数据(例如有5万细胞)推荐louvain
方法
# ---------进行降维---------
cds <- reduceDimension(cds, max_components = 2, num_dim = 6,
reduction_method = 'tSNE', verbose = T)
# ---------进行聚类---------
# (这里先设置num_clusters,不一定最后真就分成5群;我们这里设置5,最后只能得到4群;如果设成4,结果就得到3群)
cds <- clusterCells(cds, num_clusters = 5)
# Distance cutoff calculated to 1.818667
# color使用的这些数据就在:cds$Cluster
plot_cell_clusters(cds, 1, 2, color = "Cluster")
因为之前还做过层次聚类,所以还可以对比一下:
plot_cell_clusters(cds, 1, 2, color = "g")
看到monocle使用其他的聚类算法确实不如使用自己的结果得到的效果好
再次对比不同分群结果的基因数量差异:
boxplot(cds@phenoData@data$num_genes_expressed~cds@phenoData@data$Cluster)
boxplot(cds@phenoData@data$num_genes_expressed~cds@phenoData@data$g)
去除一些影响因素
因为这几群的细胞中基因表达数量是有差别的,因此我们可以在聚类之前先去掉这部分影响,从而关注它们真正的生物学影响
## 去除检测到基因数量效应
cds <- reduceDimension(cds, max_components = 2, num_dim = 2,
reduction_method = 'tSNE',
residualModelFormulaStr = "~num_genes_expressed",
verbose = T)
cds <- clusterCells(cds, num_clusters = 5)
plot_cell_clusters(cds, 1, 2, color = "Cluster")
差异分析
=> differentialGeneTest()
start=Sys.time()
diff_test_res <- differentialGeneTest(cds,
fullModelFormulaStr = "~Cluster")
end=Sys.time()
end-start
# Time difference of 4.724045 mins
挑差异基因
# 选择FDR-adjusted p-value(也就是q值) < 10%的基因作为差异基因
sig_genes <- subset(diff_test_res, qval < 0.1)
dim(sig_genes)
> head(sig_genes[,c("gene_short_name", "pval", "qval")] )
gene_short_name pval qval
0610007P14Rik 0610007P14Rik 3.377244e-03 1.277429e-02
0610010F05Rik 0610010F05Rik 1.243943e-02 3.924761e-02
0610011F06Rik 0610011F06Rik 2.338530e-03 9.285587e-03
1110004E09Rik 1110004E09Rik 2.487600e-02 7.003903e-02
1110020A21Rik 1110020A21Rik 2.318327e-02 6.618129e-02
1110059E24Rik 1110059E24Rik 5.193533e-09 4.856089e-08
作图
cg=as.character(head(sig_genes$gene_short_name))
# 还能上色
plot_genes_jitter(cds[cg,],
grouping = "Cluster",
color_by = "Cluster",
nrow= 3,
ncol = NULL )
推断发育轨迹
step1: 选合适基因 => setOrderingFilter()
ordering_genes <- row.names (subset(diff_test_res, qval < 0.01))
cds <- setOrderingFilter(cds, ordering_genes)
plot_ordering_genes(cds)
step2: 降维 => reduceDimension()
# 默认使用DDRTree的方法
cds <- reduceDimension(cds, max_components = 2,
method = 'DDRTree')
step3: 细胞排序 => orderCells()
cds <- orderCells(cds)
最后可视化
plot_cell_trajectory(cds, color_by = "Cluster")