Commit 53c01fdc authored by Anne  BITON's avatar Anne BITON
Browse files

slingshot lineage trajectory

parent 1a25925f
---
title: "E10.5 and E12.5 Fetal Liver samples: Lineage trajectories"
author: "Anne Biton"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
prettydoc::html_pretty:
highlight: github
number_sections: yes
theme: cayman
toc: yes
---
```{r setup, eval=TRUE, echo=FALSE, warning = FALSE, message=FALSE, results=FALSE, prompt=FALSE}
knitr::opts_chunk$set(echo = FALSE, cache = TRUE, warning = FALSE, message=FALSE, results=FALSE, prompt=FALSE, dpi=150)
library(data.table)
library(ggplot2)
library(foreach)
library(biomaRt)
library(matrixStats)
library(Seurat)
library(Matrix)
library(scater)
library(scran)
library(pheatmap)
library(dplyr)
library(scales)
library(slingshot)
require(gam)
require(clusterExperiment)
library(EGSEA)
dirdata <- "../../data/"
dirtrajectories <- paste0(dirdata,'derived/FL/trajectories/')
dir.create(dirtrajectories)
```
```{r}
filesce <- paste0(dirdata,'derived/FL/FL_sceall_seurat_fastMNN_bycond.rds')
sceall_seurat <- readRDS(filesce)
```
```{r}
mart <- useMart(biomart="ensembl", dataset="mmusculus_gene_ensembl")
geneAnnot <- biomaRt::getBM(values = rownames(sceall_seurat@assays$RNA_raw@data),
attributes = c('mgi_symbol', 'mgi_id', 'entrezgene_id','description', 'chromosome_name',
'start_position', 'end_position', 'strand', 'gene_biotype'),
filters = 'mgi_symbol', mart=mart)
geneAnnot$mgi_id <- paste0('http://www.informatics.jax.org/marker/',geneAnnot$mgi_id)
# add missing
geneAnnot <- rbind(geneAnnot, data.frame( mgi_symbol=setdiff(rownames(sceall_seurat@assays$RNA_raw@data), geneAnnot$mgi_symbol),
mgi_id=NA, entrezgene_id=NA, description=NA, chromosome_name=NA, start_position=NA,
end_position=NA, strand=NA, gene_biotype=NA))
```
# Infer lineage trajectories on the merged dataset
## Pseudo-time analysis using `slingshot` (Street et al., 2018)
```{r}
dirslingshot <- paste0(dirtrajectories,'slingshot/')
dir.create(dirslingshot)
```
* We used the clusters computed using random walks on a SNN graph (see script `03.clustering`) to determine the global structure of the lineages. Slingshot identifies the global lineage structure with a cluster-based minimum spanning tree and fits simultaneous principal curves to describe each lineage
* The 'EMP' cluster is used as the starting point.
* Curves are fitted to get smooth lineages: we don't allow the first segment to extend beyond the center of the EMP cluster (option `extend="n"`).
```{r slingshot, fig.width=7, fig.height=6.5, dpi=300}
#add new color scheme
cols_knn_clusters2 <- c('EMP' = "#e6c9ff", 'MP' = "#8a90ff", 'GP' = "#8fc1ff", 'Mac' = "#323e99", 'Mast' = "#f7c868", 'EP' = "#ffc9ba", 'early_Ery' = "#ff8969", 'mid_Ery' = "#fa435f", 'late_Ery' = "#ad0720", 'MkP' = "#fa7602")
cols_knn_clusters2 <- structure(.Data=alpha(cols_knn_clusters2, .6), .Names=names(cols_knn_clusters2))
# trying to fix a cluster as the starting point of the trajectory
lin2 <- slingshot::getLineages(data = as.SingleCellExperiment(sceall_seurat),
reducedDim = 'UMAP',
clusterLabels = as.character(sceall_seurat$clusterIdents),
start.clus= 'EMP',
omega=20)
plot(reducedDims(lin2)$UMAP, col = cols_knn_clusters2[as.character(lin2$clusterIdents)], pch=16, asp = 1, cex=0.5)
lines(SlingshotDataSet(lin2), lwd=2, type = 'lineages', col = 'black', show.constraints = TRUE, cex=1.5)
knitr::kable(cbind(sapply(SlingshotDataSet(lin2)@lineages, paste, collapse= ' - '))) %>% print()
#add new color scheme
cols_pseudotime <- c('pseudotime 1' = "dodgerblue3", 'pseudotime 2' = "violetred2", 'pseudotime 3' = "limegreen", 'pseudotime 4' = "purple", 'pseudotime 5' = "orange")
crv2 <- getCurves(lin2, extend='n')
#pdf("lineages.pdf", 7, 6.5)
plot(reducedDims(lin2)$UMAP, col = cols_knn_clusters2[as.character(lin2$clusterIdents)], pch=16, asp = 1, cex=1, cex.lab=1, cex.axis=1, main = 'extend=n')
lines(SlingshotDataSet(crv2), lwd=4, col= cols_pseudotime)
legend('bottomright', legend = paste('pseudotime', 1:length(SlingshotDataSet(crv2)@lineages)),
col= cols_pseudotime, lty=1, lwd=4, cex=1.25)
#dev.off()
```
### Plot pseudotime versus expression of some genes
```{r}
foreach (gene= c('Sfpi1', 'Gata1')) %do% {
foreach (
indexLin=c(1:5),
labelLin=paste(names(SlingshotDataSet(crv2)@lineages), sapply(SlingshotDataSet(crv2)@lineages, paste0, collapse='-'), sep=': ')[c(1:5)]) %do% {
tibble(pseudotime=crv2[[paste0('slingPseudotime_',indexLin)]],
gene=crv2@assays@data$logcounts[gene,],
cluster=crv2$knn_clusters) %>%
na.omit(.) %>% ggplot(.,aes(x=pseudotime, y=gene, fill=gene)) + geom_point() + geom_smooth() + ggtitle(labelLin) + ylab(gene) +geom_rug(aes(color=cluster),sides = 'b',show.legend = T) + scale_color_manual(values=cols_knn_clusters2[as.character(unique(lin2$knn_clusters))])
}
}
#plot Sfpi1 and Gata1 together by overlay (pseudotime 3)
cols_lin1 <- c('gene' = "dodgerblue4")
cols_lin1 <- structure(.Data=alpha(cols_lin1, .5), .Names=names(cols_lin1))
#pdf("lineages_smooth1.pdf", 9, 5)
tibble(pseudotime=crv2[[paste0('slingPseudotime_',indexLin=3)]],gene=crv2@assays@data$logcounts['Sfpi1',], cluster=crv2$knn_clusters) %>%
na.omit(.) %>%
ggplot(.,aes(x=pseudotime, y=gene, fill=gene)) + geom_point(colour=cols_lin1, size=2.75) + geom_smooth(colour="dodgerblue4", fill="dodgerblue4") + ggtitle('Lineage 3') +geom_rug(aes(color=cluster),sides = 'b',show.legend = T) + scale_color_manual(values=cols_knn_clusters2[as.character(unique(lin2$knn_clusters))])
#dev.off()
cols_lin2 <- c('gene' = "coral2")
cols_lin2 <- structure(.Data=alpha(cols_lin2, .5), .Names=names(cols_lin2))
#pdf("lineages_smooth2.pdf", 9, 5)
tibble(pseudotime=crv2[[paste0('slingPseudotime_',indexLin=3)]],gene=crv2@assays@data$logcounts['Gata1',], cluster=crv2$knn_clusters) %>%
na.omit(.) %>%
ggplot(.,aes(x=pseudotime, y=gene, fill=gene)) + geom_point(colour=cols_lin2, size=2.75) + geom_smooth(colour="coral2", fill="coral2") + ggtitle('Lineage 3') +geom_rug(aes(color=cluster),sides = 'b',show.legend = T) + scale_color_manual(values=cols_knn_clusters2[as.character(unique(lin2$knn_clusters))]) + theme(panel.background = element_rect(fill = "transparent", colour = NA), plot.background = element_rect(fill = "transparent", colour = NA))+theme(panel.grid.major = element_blank(),panel.grid.minor = element_blank())
#dev.off()
#plot Sfpi1 and Gata1 together by overlay (pseudotime 4)
#pdf("lineages_smooth3.pdf", 9, 5)
tibble(pseudotime=crv2[[paste0('slingPseudotime_',indexLin=4)]],gene=crv2@assays@data$logcounts['Sfpi1',], cluster=crv2$knn_clusters) %>%
na.omit(.) %>%
ggplot(.,aes(x=pseudotime, y=gene, fill=gene)) + geom_point(colour=cols_lin1, size=2.75) + geom_smooth(colour="dodgerblue4", fill="dodgerblue4") + ggtitle('Lineage 4') +geom_rug(aes(color=cluster),sides = 'b',show.legend = T) + scale_color_manual(values=cols_knn_clusters2[as.character(unique(lin2$knn_clusters))])
#dev.off()
#pdf("lineages_smooth4.pdf", 9, 5)
tibble(pseudotime=crv2[[paste0('slingPseudotime_',indexLin=4)]],gene=crv2@assays@data$logcounts['Gata1',], cluster=crv2$knn_clusters) %>%
na.omit(.) %>%
ggplot(.,aes(x=pseudotime, y=gene, fill=gene)) + geom_point(colour=cols_lin2, size=2.75) + geom_smooth(colour="coral2", fill="coral2") + ggtitle('Lineage 4') +geom_rug(aes(color=cluster),sides = 'b',show.legend = T) + scale_color_manual(values=cols_knn_clusters2[as.character(unique(lin2$knn_clusters))]) + theme(panel.background = element_rect(fill = "transparent", colour = NA), plot.background = element_rect(fill = "transparent", colour = NA))+theme(panel.grid.major = element_blank(),panel.grid.minor = element_blank())
#dev.off()
```
## Get genes associated with pseudotime using the filtered dataset
* The genes whose expression is changing in a continuous manner over pseudotime are extracted by fitting a Generalized Additive Model with a LOESS term for pseudotime.
* The results are in `r paste0(dirslingshot,'FL_slingshot_markersLineages.xlsx')` and a summary of the markers associated with each combination of pseudotime is available in
The results are in `r paste0(dirslingshot,'FL_slingshot_markersLineages_summary.xlsx')`.
* The p-values are adjusted using the Bonferroni method.
* The heatmaps show the top 30 genes (30 smallest p-values) for each lineage.
```{r}
# We would like to find genes whose expression is changing in a continuous manner over pseudotime.
# This can be done by fitting a Generalized Additive Model (GAM) with a LOESS term for pseudotime.
# See “Generalized Additive Models” (Hastie and Tibshirani, 1990).
sceall <- as.SingleCellExperiment(sceall_seurat)
Y <- sceall@assays@data$logcounts
## == for each pseudotime ==
tnames <- paste0('slingPseudotime_',1:length(SlingshotDataSet(crv2)@lineages))
gampvals_perpseudotime <- foreach(tname = tnames) %do% {
# fit a GAM for each gene with a loess term for pseudotime
ps <- crv2[[tname]]
gam.pval <- apply(Y,1,function(z) {
d <- data.frame(z=z, t=ps)
tmp <- suppressWarnings(gam(z ~ lo(t), data=d))
p <- summary(tmp)[4][[1]][1,5]
p
})
topgenes_ps <- names(sort(gam.pval, decreasing = FALSE))[1:30]
names(ps) <- colnames(crv2)
selps <- ps[which(!is.na(ps))]
selps <- names(sort(ps))
pheatmap(t(apply(sceall@assays@data$logcounts[topgenes_ps,selps], 1, function(x) x-mean(x))),
show_rownames = T, show_colnames = F,
annotation_col = data.frame(pseudotime=ps[selps],
cluster=sceall_seurat[,selps]$clusterIdents,
#FACS=sceall_seurat[,selps]$FACS,
id=selps, row.names = 'id') %>%
dplyr::rename(!!tname := pseudotime),
annotation_colors = list(cluster=cols_knn_clusters2[unique(as.character(sceall$clusterIdents))]),
cluster_cols = FALSE, cluster_rows = TRUE, fontsize_row = 9)
return(gam.pval)
}
names(gampvals_perpseudotime) <- tnames
```
```{r write results of gene association with pseudotime}
res <- lapply(gampvals_perpseudotime,
function(x) {
x <- sort(x)
x = data.frame(gene=names(x), pval=x,
adj.pval=p.adjust(x, method='bonferroni'),
geneAnnot[match(names(x), geneAnnot$mgi_symbol),])
x$pval <- signif(x$pval,4)
x$adj.pval <- signif(x$adj.pval,4)
return(x)
})
res %>%
openxlsx::write.xlsx(., file = paste0(dirslingshot,'FL_slingshot_markersLineages.xlsx'), asTable = TRUE, row.names = TRUE)
```
* The number of markers with an adjusted p-value lower than 0.01 per lineage are:
```{r, results='asis'}
do.call(rbind,lapply(res, function(x) sum(x$adj.pval < .01))) %>% knitr::kable() %>% print()
```
```{r function to summarize in which pseudotime a marker is found to be significant}
# for each gene, extract the pseudotimes in which it is detected as a marker
summarizeMarkerSpecificity <- function(markers_sel, adj.pval_cutoff=0.01) {
allmarkergenes <- unique(unlist(lapply(markers_sel, function(x) as.character(x$gene))))
markers_sel <- lapply(markers_sel, function(x) {rownames(x) <- x$gene; x})
ismarker <- mapply(x=markers_sel,
clus=names(markers_sel),
function(x,clus) {
xx <- x[allmarkergenes,c('adj.pval'),drop=F]
rownames(xx) <- allmarkergenes
return(xx)
}, SIMPLIFY = FALSE)
ismarker <- do.call(cbind, ismarker)
colnames(ismarker ) <- names(markers_sel)
ismarker_boolean <- as.matrix(ismarker) < adj.pval_cutoff
ismarker_boolean[is.na(ismarker_boolean)] <- FALSE
ismarker_boolean <- as.data.table(ismarker_boolean)
ismarker_boolean[, nbPseudotimes := rowSums(ismarker_boolean)]
ismarker_boolean[, gene := rownames(ismarker)]
ismarker_boolean <- ismarker_boolean[order(ismarker_boolean$nbPseudotimes),]
ismarker_boolean <- ismarker_boolean[, c("gene", 'nbPseudotimes',setdiff(colnames(ismarker_boolean), c('gene','nbPseudotimes'))), with=FALSE]
ismarker_boolean[, whichPseudotimes := paste0(sort(setdiff(colnames(ismarker_boolean), c('gene','nbPseudotimes'))[which(unlist(.SD) == "TRUE")]), collapse=','), .SDcols = setdiff(colnames(ismarker_boolean), c('gene','nbPseudotimes')), by = gene]
ismarker_info <- cbind(ismarker_boolean[, c('gene','nbPseudotimes', 'whichPseudotimes')], ismarker[match(ismarker_boolean$gene, rownames(ismarker)),])
ismarker_info$whichPseudotimes <- gsub('.FDR', '', ismarker_info$whichPseudotimes)
return(ismarker_info)
}
```
```{r summarize association of markers with pseudotimes}
# read result files and extract significant genes
res_names <-
openxlsx::getSheetNames(paste0(dirslingshot,'FL_slingshot_markersLineages.xlsx'))
res <-
lapply(res_names, function(x) openxlsx::read.xlsx(xlsxFile = paste0(dirslingshot,'FL_slingshot_markersLineages.xlsx'), sheet=x))
names(res) <- res_names
res_sel <- lapply(res, function(x) {
x[x$adj.pval<.01,]
})
# number of genes associated with pseudotime using Bonferroni-adjusted p-value < .01
lapply(res_sel, nrow)
summarizeMarkerSpecificity(markers_sel = res_sel, adj.pval_cutoff = 0.01) %>% cbind(., geneAnnot[match(.$gene,geneAnnot$mgi_symbol),]) %>% openxlsx::write.xlsx(paste0(dirslingshot,'FL_slingshot_markersLineages_summary.xlsx'))
```
```{r perform pathway analysis, eval=FALSE}
# using MSigDB
resora <- foreach(resall = res,
ressel = res_sel,
clusterind = names(res_sel)) %do% {
geneids <- ressel$gene
if (length(geneids) > 0) {
entrezids <- as.character(geneAnnot[match(tolower(geneids), tolower(geneAnnot$mgi_symbol)),]$entrezgene_id)
genes <- geneAnnot[match(tolower(resall$gene),tolower(geneAnnot$mgi_symbol)), c('entrezgene_id', 'mgi_symbol')]
colnames(genes) <- c('FeatureID', 'Symbols')
genes <- na.omit(genes)
gs.annots_ora = buildIdx(entrezIDs = entrezids[!is.na(entrezids)], species = "mouse", gsdb.gsets = "all", kegg.exclude = TRUE)
ora = egsea.ora(geneIDs=entrezids[!is.na(entrezids)],
universe= genes$entrezgene_id, title=clusterind,
gs.annots=gs.annots_ora,
symbolsMap=genes[match(entrezids[!is.na(entrezids)], genes$FeatureID),],
num.threads = 2,
report = F,
report.dir = '')
}
else {
ora <- NA
}
return(ora)
}
names(resora) = names(res_sel)
# write in excel tables
dir.create(paste0(dirslingshot,'/ORA_pathways'))
foreach( comp = names(resora)) %do% {
getres <- mapply(res = resora[[comp]]@results,
degenes = resora[[comp]]@gs.annots,
function(res, degenes) {
res$test.results[[1]] <- subset(res$test.results[[1]], p.adj < .25)
if(nrow(res$test.results[[1]]) > 100) res$test.results[[1]] <- res$test.results[[1]][1:100,]
cbind(res$test.results[[1]],
degenes@anno[match(rownames(res$test.results[[1]]), rownames(degenes@anno)),],
genes=sapply(degenes@idx[match(rownames(res$test.results[[1]]), names(degenes@idx))],
function(x, degenes) paste(geneAnnot$mgi_symbol[match(degenes@featureIDs[x],geneAnnot$entrezgene_id)], collapse=',')
, degenes = degenes))
}, SIMPLIFY = FALSE)
#unlist(lapply(getres, function(x) sum(x$p.adj < .1)))
openxlsx::write.xlsx(getres, file = paste0(dirslingshot,"/ORA_pathways/FL_slingshot_ORA_", gsub(' ','',comp),".xlsx"), row.names=TRUE)
}
```
```{r pathway ORA top 5 using post-filtering dataset, results='asis', eval = FALSE}
topora <- foreach (comp = names(res)) %do% {
reg <- readxl::read_xlsx(paste0(dirslingshot,"/ORA_pathways/", "FL_slingshot_ORA_", gsub(' ','',comp), '.xlsx'),
sheet = 'gsdbreg')
go <- readxl::read_xlsx(paste0(dirslingshot,"/ORA_pathways/", "FL_slingshot_ORA_", gsub(' ','',comp), '.xlsx'),
sheet = 'gsdbgo')
path <- readxl::read_xlsx(paste0(dirslingshot,"/ORA_pathways/", "FL_slingshot_ORA_", gsub(' ','',comp), '.xlsx'),
sheet = 'gsdbpath')
reg <- subset(reg, p.adj < .05)
go <- subset(go, p.adj < .05)
path <- subset(path, p.adj < .05)
cat(paste0('\n\n##### ',comp, ' \n\n') )
#%>% knitr::kable() %>% print()
reg[1:min(5,nrow(reg)),c("GeneSet", "p.adj", "NumGenes", "genes")] %>% knitr::kable() %>% print()
go[1:min(5,nrow(go)),c("GeneSet", "p.adj", "NumGenes", "genes")] %>% knitr::kable() %>% print()
path[1:min(5,nrow(path)),c("GeneSet", "p.adj", "NumGenes", "genes")] %>% knitr::kable() %>% print()
}
```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment