-
Anne BITON authoredAnne BITON authored
- Load data
- Runs 1, 2 and 3 filtered
- Expressed genes
- Merging
- Feature selection using quantification of technical noise and performed across batches
- Merging across runs using fastMNN
- UMAP plots colored by run, plates, condition and previous clustering
- UMAP plots colored by FACS data
- UMAP plots colored with respect to correlation to metacells from the Mouse Cell Atlas project
title: "Merge E10.5 and E12.5 Fetal Liver samples from the three different library runs"
author: "Anne Biton"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
prettydoc::html_pretty:
highlight: github
number_sections: yes
theme: cayman
toc: yes
knitr::opts_chunk$set(echo = FALSE, cache = TRUE, warning = FALSE, message=FALSE, results=FALSE, prompt=FALSE, dpi=150)
library(data.table)
library(ggplot2)
library(foreach)
library(BiocParallel)
library(Biobase)
library(dplyr)
library(tidyr)
library(Seurat)
library(Matrix)
library(Seurat)
library(Matrix)
library(scater)
library(scran)
library(dplyr)
library(scales)
library(batchelor)
dirdata <- "../../data/"
Load data
Runs 1, 2 and 3 filtered
We load the data obtained after a first filtering step was applied: cells for which mitochondrial genes took up more than 5% of the total UMI counts and that had less than 3,000 detected genes (i.e. less than 3,000 non-zero UMI counts) were filtered out, and genes that were detected in at least 5 cells were conserved. Doublets were removed using scDblFinder
. See script FL_01.load.Rmd
Filtered data can be found in data/derived/FL/FL_sce.rds
sce <- readRDS(paste0(dirdata,'derived/FL/FL_sce.rds'))
Expressed genes
We then select genes detected in at least 5% of the cells of each condition. We then intersect the genes that are detected across all plates for each condition.
sce$run <- paste0('Run',sce$nextseq500_run)
sce_bycond <- SplitObject(as.Seurat(sce), split.by = 'condition')
sce_bycond_filter <- lapply(sce_bycond, function(x) {
x <- CreateSeuratObject(count=x@assays$RNA@counts,
min.cells=round(ncol(x)*5/100),
min.features =3000)
})
intgenes <- Reduce(intersect,(lapply(sce_bycond_filter,rownames)))
intgenes <- intgenes[grep('^ERCC-', intgenes, invert=TRUE)]
length(intgenes)
sce_filter <- lapply(sce_bycond, function(x) x[intgenes,])
sce_filter <- Reduce(merge, sce_filter)
r length(intgenes)
genes are used as the set of "expressed" genes.
Merging
Feature selection using quantification of technical noise and performed across batches
The normalization by deconvolution procedure of the scran
package is used and
multiBatchNorm
of the batchelor
package is used to adjust for differences in sequencing depth between batches.
sce_merge <- lapply(SplitObject(sce_filter, split.by = 'run'), as.SingleCellExperiment)
# normalization by condition using deconvolution
#https://osca.bioconductor.org/normalization.html#normalization-by-deconvolution
clusters <- lapply(sce_merge, quickCluster, min.size = 50, min.mean=0.1)#, use.ranks=FALSE)
sce_merge <- mapply(sce = sce_merge, cluster = clusters, function(sce, cluster) computeSumFactors(sce, clusters=cluster, min.mean=0.1))
sce_merge <- lapply(sce_merge, scater::logNormCounts)
# multiBatchNorm adjusts for differences in sequencing depth between plates by downscaling all batches to match the coverage of the least-sequenced batch.
sce_merge <- do.call(batchelor::multiBatchNorm,sce_merge)
decs <- lapply(sce_merge, function(x) scran::modelGeneVar(x, block=x$condition_plate))#,min.mean=0.05, loess.args=list(span=0.05))) # ,
decs <- lapply(decs, function(x) x[order(x$bio, decreasing=TRUE),])
par(mfrow=c(4,3))
lapply(decs,function(dec.block.416b) {
blocked.stats <- dec.block.416b$per.block
for (i in colnames(blocked.stats)) {
current <- blocked.stats[[i]]
plot(current$mean, current$total, main=i, pch=16, cex=0.5,
xlab="Mean of log-expression", ylab="Variance of log-expression")
curfit <- metadata(current)
points(curfit$mean, curfit$var, col="red", pch=16)
curve(curfit$trend(x), col='dodgerblue', add=TRUE, lwd=2)
}}
)
combined.decs <- do.call(combineVar, lapply(decs, function(x) x[intgenes,]))
bio <- combined.decs[order(combined.decs$bio, decreasing = TRUE),'bio']
chosen <- names(bio[1:3000])
sceall <- lapply(sce_merge, function(x) x[chosen,])
sceall <- do.call(cbind,sceall)
# keep all genes and not only the variable genes in the count matrices
sce_allgenes <- do.call(cbind,sce_merge)
We selected r length(chosen)
genes with the largest positive average biological components across all batches.
We use batchelor::multiBatchPCA
for PCA computation to equalize contributions across batches. We compute it using the selected set of highly variable genes.
sceall$condition_run <- paste(sceall$condition, sceall$run, sep='_')
sceall$condition_run_plate <- paste(sceall$condition, sceall$run, sceall$plate, sep='_')
pcs <- batchelor::multiBatchPCA(sceall, batch=sceall$condition_run_plate, BPPARAM= BiocParallel::MulticoreParam(3))
names(pcs)
reducedDim(sceall, "multiBatchPCA") <- do.call(rbind,pcs)[colnames(sceall),]
reducedDim(sce_allgenes, "multiBatchPCA") <- do.call(rbind,pcs)[colnames(sce_allgenes),]
sceall_seurat <- as.Seurat(sce_allgenes)
VariableFeatures(sceall_seurat) <- chosen
sce_merge <- lapply(SplitObject(as.Seurat(sce), split.by = 'run'), as.SingleCellExperiment)
# normalization by condition using deconvolution
#https://osca.bioconductor.org/normalization.html#normalization-by-deconvolution
clusters <- lapply(sce_merge, quickCluster, min.size = 50, min.mean=0.1)#, use.ranks=FALSE)
sce_merge <- mapply(sce = sce_merge, cluster = clusters, function(sce, cluster) computeSumFactors(sce, clusters=cluster, min.mean=0.1))
sce_merge <- lapply(sce_merge, scater::logNormCounts)
# multiBatchNorm adjusts for differences in sequencing depth between plates by downscaling all batches to match the coverage of the least-sequenced batch.
sce_merge <- do.call(batchelor::multiBatchNorm,sce_merge)
sceallbis <- do.call(cbind,sce_merge)
sceall_seurat[['RNA_raw']] <- Seurat::CreateAssayObject(#counts = counts(sceallbis)[, colnames(sceall_seurat)],
data = logcounts(sceallbis)[, colnames(sceall_seurat)])
sceall_seurat[['RNA_raw']]@counts <- counts(sceallbis)[, colnames(sceall_seurat)]
fastMNN
Merging across runs using Batch effects were removed by hierarchically merging the samples using the reduceMNN
function of the batchelor R package on the 50 principal components calculated with the multiBatchPCA
function. fastMNN
detects mutual nearest neighbors (MNN) of cells in different batches, and then uses the MNN to correct the values in each PCA subspace.
Merging is done using the r length(chosen)
variable genes selected above among all expressed genes r length(intgenes)
(i.e. detected in at least 5% of cells across all runs and conditions).
Each condition was merged across runs. Then all conditions were merged in order of E12.5 FL > E10.5 FL. Data is saved in sceall_seurat_fastMNN_bycond.rds
.
mnn.e10.5 <- do.call(batchelor::reducedMNN, c(pcs[grep('E10.5_Fetal_Liver',names(pcs))], args = list(k = 20, auto.merge = TRUE)))
mnn.e12.5 <- do.call(batchelor::reducedMNN, c(pcs[grep('E12.5_Fetal_Liver',names(pcs))], args = list(k = 20, auto.merge = TRUE)))
mnn.final <- batchelor::reducedMNN(mnn.e12.5$corrected,
mnn.e10.5$corrected)
# add fastMNN projection/correction as a new DimReduc object
sceall_seurat[['fastMNN']] <- CreateDimReducObject(embeddings = mnn.final$corrected[colnames(sceall_seurat),], key = 'fastMNN')
# save raw count data
#sceall_seurat[['RNA_raw']] <- Seurat::CreateAssayObject(counts = sce_filter@assays$RNA@counts[, colnames(sceall_seurat)])
UMAP plots colored by run, plates, condition and previous clustering
sceall_seurat <- RunTSNE(object = sceall_seurat, dims = 1:50, reduction = 'fastMNN', verbose=FALSE)
sceall_seurat <- RunUMAP(object = sceall_seurat, dims = 1:50, reduction = 'fastMNN', verbose=FALSE)
#gridExtra::marrangeGrob(list(
DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'run', pt.size = 0.75, dims = c(1,2), label = FALSE)#,
DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'plate', pt.size = 0.75, dims = c(1,2), label = FALSE)#,
DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'condition', pt.size = 0.75, dims = c(1,2), label = FALSE)#,
FeaturePlot(object = sceall_seurat , reduction = "umap", features = 'nFeature_RNA', pt.size = 1, dims = c(1,2), label = FALSE)#,
FeaturePlot(object = sceall_seurat , reduction = "umap", features = 'percent.mito', pt.size = 1, dims = c(1,2), label = FALSE)#,
UMAP plots colored by FACS data
ggs <- sapply(c("CD41_Ab", "CD61_Ab", "CD71_Ab", "AA4_1_Ab", "CD16_32_Ab", "CD34_Ab", "Mk_gate", "pMac_gate", "CD45_Ab"),
function(i, sceall_seurat) {
sceall_seurat@meta.data[[i]] <- factor(sceall_seurat@meta.data[[i]], levels = c('1','0', 'NA'))
gg <- DimPlot(object = sceall_seurat , reduction = "umap",
#cells.highlight =which(sceall_seurat[[i]] == '1'),
#cols.highlight = 'black',
group.by = i, pt.size = .5, dims = c(1,2), label = FALSE,
cols = c('1'='black', '0'=alpha('grey',.5), 'NA'=alpha('grey',.5))) + ggtitle(i)
return(gg)
}, sceall_seurat , simplify = FALSE)
gridExtra::marrangeGrob(ggs, ncol = 3, nrow=3)
ggfs <- sapply(c("FSC_A_MFI", "SSC_A_MFI", "Kit_MFI", "CD41_MFI", "CD61_MFI", "CD71_MFI", "AA4_1_MFI", "CD16_32_MFI", "CD34_MFI", "CD45_MFI"),
function(i, sceall_seurat) {
sceall_seurat@meta.data[[i]] <- as.numeric(sceall_seurat@meta.data[[i]])
gg <- FeaturePlot(object = sceall_seurat , reduction = "umap",
#cells.highlight =which(sceall_seurat[[i]] == '1'),
#cols.highlight = 'black',
features = i, pt.size = .5, min.cutoff="q10", max.cutoff="q95", dims = c(1,2), label = FALSE) + ggtitle(i)
return(gg)
}, sceall_seurat , simplify = FALSE)
gridExtra::marrangeGrob(ggfs, ncol = 4, nrow=3)
# applying biexponential transormation
# from https://haoeric.com/cytometry-transformation-visual-comparation/
#biexp <- function(x, a = 0.5, b = 1, c = 0.5, d = 1, f = 0, w = 0,
# tol = .Machine$double.eps^0.25,
# maxit = as.integer(5000)){
# flowCore:::biexponential_transform(input = x,
# A = a,
# B = b,
# C = c,
# D = d,
# F = f,
# W = w,
# tol = tol,
# maxIt = maxit)
#}
#ggfs <- sapply(c("CD41_MFI", "CD61_MFI", "CD71_MFI", "AA4_1_MFI", "CD16_32_MFI", "CD34_MFI", "CD45_MFI"),
# function(i, sceall_seurat) {
# sceall_seurat@meta.data[[i]] <- biexp(as.numeric(sceall_seurat@meta.data[[i]]))
# gg <- FeaturePlot(object = sceall_seurat , reduction = "umap",
#cells.highlight =which(sceall_seurat[[i]] == '1'),
#cols.highlight = 'black',
# features = i, pt.size = .5, dims = c(1,2), label = FALSE) +
# ggtitle(paste(i, 'biexp trans'))
# return(gg)
# }, sceall_seurat , simplify = FALSE)
#gridExtra::marrangeGrob(ggfs, ncol = 2, nrow=2)
UMAP plots colored with respect to correlation to metacells from the Mouse Cell Atlas project
Test of the package of the Mouse Cell Atlas project providing annotations of cells based on correlation analysis with the MCA dataset https://github.com/ggjlab/scMCA
Warning: the cell annotation is assigned to the most correlated cell type, as available in a reference matrix provided by scMCA. But we could add a threshold on correlation value or other filters.
We assign 'NA' to the cell types assigned by scMCA that match less than 10 cells (to make the plot a bit more readable).
mca_result <- scMCA::scMCA(scdata = sceall_seurat@assays$RNA@data, numbers_plot = 3)
# assignments to top correlations are in
#mca_result$scMCA
mca_result$scMCA <- structure(.Names = names(mca_result$scMCA),
.Data = paste0(mca_result$scMCA, ' (', table(mca_result$scMCA)[mca_result$scMCA], ')'))
sceall_seurat$scMCA <- mca_result$scMCA[colnames(sceall_seurat)]
sceall_seurat$scMCA[grep('\\(1\\)|\\(2\\)|\\(3\\)|\\(4\\)|\\(5\\)|\\(6\\)|\\(7\\)|\\(8\\)|\\(9\\)|\\(10\\)', sceall_seurat$scMCA, perl = FALSE)] <- NA
sceall_seurat$scMCA_score <- apply(mca_result$cors_matrix, 2, max)
DimPlot(object = sceall_seurat , reduction = "umap", group.by = "scMCA", cells = colnames(sceall_seurat)[which(sceall_seurat$scMCA != 'NA')], pt.size = 1, dims = c(1,2))
FeaturePlot(object = sceall_seurat , reduction = "umap", features = "scMCA_score", pt.size = 1, dims = c(1,2))
saveRDS(sceall_seurat, file = paste0(dirdata,'derived/FL/FL_sceall_seurat_fastMNN_bycond.rds'))
saveRDS(sceall_seurat, file = paste0(dirdata,'derived/FL/FL_sceall_seurat_fastMNN_bycond_precluster.rds'))