Skip to content
Snippets Groups Projects
title: "Merge E10.5 and E12.5 Fetal Liver samples from the three different library runs"
author: "Anne Biton"
date: "`r format(Sys.time(), '%d %B, %Y')`"

output:
  prettydoc::html_pretty:
    highlight: github
    number_sections: yes
    theme: cayman
    toc: yes
knitr::opts_chunk$set(echo = FALSE, cache = TRUE, warning = FALSE, message=FALSE, results=FALSE, prompt=FALSE, dpi=150)

library(data.table)
library(ggplot2)
library(foreach)
library(BiocParallel)

library(Biobase)
library(dplyr)
library(tidyr)
library(Seurat)
library(Matrix)


library(Seurat)
library(Matrix)
library(scater)
library(scran)
library(dplyr)
library(scales)
library(batchelor)

dirdata <- "../../data/"

Load data

Runs 1, 2 and 3 filtered

We load the data obtained after a first filtering step was applied: cells for which mitochondrial genes took up more than 5% of the total UMI counts and that had less than 3,000 detected genes (i.e. less than 3,000 non-zero UMI counts) were filtered out, and genes that were detected in at least 5 cells were conserved. Doublets were removed using scDblFinder. See script FL_01.load.Rmd

Filtered data can be found in data/derived/FL/FL_sce.rds

sce <- readRDS(paste0(dirdata,'derived/FL/FL_sce.rds'))

Expressed genes

We then select genes detected in at least 5% of the cells of each condition. We then intersect the genes that are detected across all plates for each condition.

sce$run <- paste0('Run',sce$nextseq500_run)

sce_bycond <- SplitObject(as.Seurat(sce), split.by = 'condition')
sce_bycond_filter <- lapply(sce_bycond, function(x) {
                                      x <- CreateSeuratObject(count=x@assays$RNA@counts,
                                                                  min.cells=round(ncol(x)*5/100),
                                                                  min.features =3000)
                                    })

intgenes <- Reduce(intersect,(lapply(sce_bycond_filter,rownames)))
intgenes <- intgenes[grep('^ERCC-', intgenes, invert=TRUE)]
length(intgenes)
sce_filter <- lapply(sce_bycond, function(x) x[intgenes,])

sce_filter <- Reduce(merge, sce_filter)

r length(intgenes) genes are used as the set of "expressed" genes.

Merging

Feature selection using quantification of technical noise and performed across batches

The normalization by deconvolution procedure of the scran package is used and multiBatchNorm of the batchelor package is used to adjust for differences in sequencing depth between batches.

sce_merge <- lapply(SplitObject(sce_filter, split.by = 'run'), as.SingleCellExperiment)


# normalization by condition using deconvolution
#https://osca.bioconductor.org/normalization.html#normalization-by-deconvolution
clusters <- lapply(sce_merge, quickCluster, min.size = 50, min.mean=0.1)#, use.ranks=FALSE)
sce_merge <- mapply(sce = sce_merge, cluster = clusters, function(sce, cluster) computeSumFactors(sce, clusters=cluster, min.mean=0.1))
sce_merge <- lapply(sce_merge, scater::logNormCounts)
# multiBatchNorm adjusts for differences in sequencing depth between plates by downscaling all batches to match the coverage of the least-sequenced batch.
sce_merge <- do.call(batchelor::multiBatchNorm,sce_merge)


decs <- lapply(sce_merge, function(x) scran::modelGeneVar(x, block=x$condition_plate))#,min.mean=0.05, loess.args=list(span=0.05))) # , 
decs <- lapply(decs, function(x) x[order(x$bio, decreasing=TRUE),])

par(mfrow=c(4,3))
lapply(decs,function(dec.block.416b) {
blocked.stats <- dec.block.416b$per.block
for (i in colnames(blocked.stats)) {
    current <- blocked.stats[[i]]
    plot(current$mean, current$total, main=i, pch=16, cex=0.5,
        xlab="Mean of log-expression", ylab="Variance of log-expression")
    curfit <- metadata(current)
    points(curfit$mean, curfit$var, col="red", pch=16)
    curve(curfit$trend(x), col='dodgerblue', add=TRUE, lwd=2) 
}}
)


combined.decs <- do.call(combineVar, lapply(decs, function(x) x[intgenes,]))
bio <- combined.decs[order(combined.decs$bio, decreasing = TRUE),'bio']
chosen <- names(bio[1:3000])

sceall <- lapply(sce_merge, function(x) x[chosen,])
sceall <- do.call(cbind,sceall)

# keep all genes and not only the variable genes in the count matrices
sce_allgenes <- do.call(cbind,sce_merge)

We selected r length(chosen) genes with the largest positive average biological components across all batches.

We use batchelor::multiBatchPCA for PCA computation to equalize contributions across batches. We compute it using the selected set of highly variable genes.

sceall$condition_run <- paste(sceall$condition, sceall$run, sep='_')
sceall$condition_run_plate <- paste(sceall$condition, sceall$run, sceall$plate, sep='_')


pcs <- batchelor::multiBatchPCA(sceall, batch=sceall$condition_run_plate, BPPARAM= BiocParallel::MulticoreParam(3))
names(pcs)

reducedDim(sceall, "multiBatchPCA") <- do.call(rbind,pcs)[colnames(sceall),]
reducedDim(sce_allgenes, "multiBatchPCA") <-  do.call(rbind,pcs)[colnames(sce_allgenes),]
sceall_seurat <- as.Seurat(sce_allgenes)
VariableFeatures(sceall_seurat) <- chosen
sce_merge <- lapply(SplitObject(as.Seurat(sce), split.by = 'run'), as.SingleCellExperiment)

# normalization by condition using deconvolution
#https://osca.bioconductor.org/normalization.html#normalization-by-deconvolution
clusters <- lapply(sce_merge, quickCluster, min.size = 50, min.mean=0.1)#, use.ranks=FALSE)
sce_merge <- mapply(sce = sce_merge, cluster = clusters, function(sce, cluster) computeSumFactors(sce, clusters=cluster, min.mean=0.1))
sce_merge <- lapply(sce_merge, scater::logNormCounts)
# multiBatchNorm adjusts for differences in sequencing depth between plates by downscaling all batches to match the coverage of the least-sequenced batch.
sce_merge <- do.call(batchelor::multiBatchNorm,sce_merge)
sceallbis <- do.call(cbind,sce_merge)

sceall_seurat[['RNA_raw']] <- Seurat::CreateAssayObject(#counts = counts(sceallbis)[, colnames(sceall_seurat)], 
                                                        data = logcounts(sceallbis)[, colnames(sceall_seurat)])
sceall_seurat[['RNA_raw']]@counts <- counts(sceallbis)[, colnames(sceall_seurat)]

Merging across runs using fastMNN

Batch effects were removed by hierarchically merging the samples using the reduceMNN function of the batchelor R package on the 50 principal components calculated with the multiBatchPCA function. fastMNN detects mutual nearest neighbors (MNN) of cells in different batches, and then uses the MNN to correct the values in each PCA subspace.

Merging is done using the r length(chosen) variable genes selected above among all expressed genes r length(intgenes) (i.e. detected in at least 5% of cells across all runs and conditions).

Each condition was merged across runs. Then all conditions were merged in order of E12.5 FL > E10.5 FL. Data is saved in sceall_seurat_fastMNN_bycond.rds.


mnn.e10.5 <- do.call(batchelor::reducedMNN, c(pcs[grep('E10.5_Fetal_Liver',names(pcs))], args = list(k = 20, auto.merge = TRUE)))
mnn.e12.5 <- do.call(batchelor::reducedMNN, c(pcs[grep('E12.5_Fetal_Liver',names(pcs))], args = list(k = 20, auto.merge = TRUE)))
mnn.final <- batchelor::reducedMNN(mnn.e12.5$corrected,
                                   mnn.e10.5$corrected)


# add fastMNN projection/correction as a new DimReduc object
sceall_seurat[['fastMNN']] <- CreateDimReducObject(embeddings = mnn.final$corrected[colnames(sceall_seurat),], key = 'fastMNN')

# save raw count data
#sceall_seurat[['RNA_raw']] <- Seurat::CreateAssayObject(counts = sce_filter@assays$RNA@counts[, colnames(sceall_seurat)])

UMAP plots colored by run, plates, condition and previous clustering

sceall_seurat <- RunTSNE(object = sceall_seurat, dims = 1:50, reduction = 'fastMNN', verbose=FALSE)
sceall_seurat <- RunUMAP(object = sceall_seurat, dims = 1:50, reduction = 'fastMNN', verbose=FALSE)

#gridExtra::marrangeGrob(list(
DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'run', pt.size = 0.75, dims = c(1,2), label = FALSE)#,
DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'plate', pt.size = 0.75, dims = c(1,2), label = FALSE)#,
DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'condition', pt.size = 0.75, dims = c(1,2), label = FALSE)#,
FeaturePlot(object = sceall_seurat , reduction = "umap", features =  'nFeature_RNA', pt.size = 1, dims = c(1,2), label = FALSE)#,
FeaturePlot(object = sceall_seurat , reduction = "umap", features =  'percent.mito', pt.size = 1, dims = c(1,2), label = FALSE)#,


UMAP plots colored by FACS data


ggs <- sapply(c("CD41_Ab", "CD61_Ab", "CD71_Ab",  "AA4_1_Ab", "CD16_32_Ab", "CD34_Ab", "Mk_gate", "pMac_gate", "CD45_Ab"), 
              function(i, sceall_seurat) {
                sceall_seurat@meta.data[[i]] <- factor(sceall_seurat@meta.data[[i]], levels = c('1','0', 'NA'))
                gg <- DimPlot(object = sceall_seurat , reduction = "umap", 
                              #cells.highlight =which(sceall_seurat[[i]] == '1'),
                              #cols.highlight = 'black',
                              group.by  = i,  pt.size = .5, dims = c(1,2), label = FALSE, 
                              cols = c('1'='black', '0'=alpha('grey',.5), 'NA'=alpha('grey',.5))) + ggtitle(i)
                return(gg)
              }, sceall_seurat , simplify = FALSE)

gridExtra::marrangeGrob(ggs, ncol = 3, nrow=3)

ggfs <- sapply(c("FSC_A_MFI", "SSC_A_MFI", "Kit_MFI", "CD41_MFI", "CD61_MFI", "CD71_MFI", "AA4_1_MFI", "CD16_32_MFI", "CD34_MFI", "CD45_MFI"), 
              function(i, sceall_seurat) {
                sceall_seurat@meta.data[[i]] <- as.numeric(sceall_seurat@meta.data[[i]])
                gg <- FeaturePlot(object = sceall_seurat , reduction = "umap", 
                              #cells.highlight =which(sceall_seurat[[i]] == '1'),
                              #cols.highlight = 'black',
                              features  = i,  pt.size = .5, min.cutoff="q10", max.cutoff="q95", dims = c(1,2), label = FALSE) + ggtitle(i)
                return(gg)
              }, sceall_seurat , simplify = FALSE)

gridExtra::marrangeGrob(ggfs, ncol = 4, nrow=3)


# applying biexponential transormation
# from https://haoeric.com/cytometry-transformation-visual-comparation/
#biexp <- function(x, a = 0.5, b = 1, c = 0.5, d = 1, f = 0, w = 0, 
#                  tol = .Machine$double.eps^0.25, 
#                  maxit = as.integer(5000)){
    
#    flowCore:::biexponential_transform(input = x, 
#                                       A = a, 
#                                       B = b, 
#                                       C = c, 
#                                       D = d, 
#                                       F = f, 
#                                       W = w, 
#                                       tol = tol, 
#                                       maxIt = maxit)
#}


#ggfs <- sapply(c("CD41_MFI", "CD61_MFI", "CD71_MFI",  "AA4_1_MFI", "CD16_32_MFI", "CD34_MFI", "CD45_MFI"), 
#              function(i, sceall_seurat) {
#                sceall_seurat@meta.data[[i]] <- biexp(as.numeric(sceall_seurat@meta.data[[i]]))
#                gg <- FeaturePlot(object = sceall_seurat , reduction = "umap", 
                              #cells.highlight =which(sceall_seurat[[i]] == '1'),
                              #cols.highlight = 'black',
#                              features  = i,  pt.size = .5, dims = c(1,2), label = FALSE) + 
#                  ggtitle(paste(i, 'biexp trans'))
#                return(gg)
#              }, sceall_seurat , simplify = FALSE)

#gridExtra::marrangeGrob(ggfs, ncol = 2, nrow=2)

UMAP plots colored with respect to correlation to metacells from the Mouse Cell Atlas project

Test of the package of the Mouse Cell Atlas project providing annotations of cells based on correlation analysis with the MCA dataset https://github.com/ggjlab/scMCA

Warning: the cell annotation is assigned to the most correlated cell type, as available in a reference matrix provided by scMCA. But we could add a threshold on correlation value or other filters.

We assign 'NA' to the cell types assigned by scMCA that match less than 10 cells (to make the plot a bit more readable).

mca_result <- scMCA::scMCA(scdata = sceall_seurat@assays$RNA@data, numbers_plot = 3)
# assignments to top correlations are in 
#mca_result$scMCA

mca_result$scMCA <- structure(.Names = names(mca_result$scMCA),
                              .Data = paste0(mca_result$scMCA, ' (', table(mca_result$scMCA)[mca_result$scMCA], ')'))
sceall_seurat$scMCA <- mca_result$scMCA[colnames(sceall_seurat)]

sceall_seurat$scMCA[grep('\\(1\\)|\\(2\\)|\\(3\\)|\\(4\\)|\\(5\\)|\\(6\\)|\\(7\\)|\\(8\\)|\\(9\\)|\\(10\\)', sceall_seurat$scMCA, perl = FALSE)] <- NA
sceall_seurat$scMCA_score <- apply(mca_result$cors_matrix, 2, max)


DimPlot(object = sceall_seurat , reduction = "umap", group.by = "scMCA", cells = colnames(sceall_seurat)[which(sceall_seurat$scMCA != 'NA')], pt.size = 1, dims = c(1,2))

FeaturePlot(object = sceall_seurat , reduction = "umap", features = "scMCA_score", pt.size = 1, dims = c(1,2))

saveRDS(sceall_seurat, file = paste0(dirdata,'derived/FL/FL_sceall_seurat_fastMNN_bycond.rds'))

saveRDS(sceall_seurat, file = paste0(dirdata,'derived/FL/FL_sceall_seurat_fastMNN_bycond_precluster.rds'))