Commit 07dafe06 authored by Anne  BITON's avatar Anne BITON
Browse files

merge sequencing runs using fastMNN

parent 6c7b70ba
---
title: "Merge E10.5 and E12.5 Fetal Liver samples from the three different library runs"
author: "Anne Biton"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
prettydoc::html_pretty:
highlight: github
number_sections: yes
theme: cayman
toc: yes
---
```{r setup, eval=TRUE, echo=FALSE, warning = FALSE, message=FALSE, results=FALSE, prompt=FALSE}
knitr::opts_chunk$set(echo = FALSE, cache = TRUE, warning = FALSE, message=FALSE, results=FALSE, prompt=FALSE, dpi=150)
library(data.table)
library(ggplot2)
library(foreach)
library(BiocParallel)
library(Biobase)
library(dplyr)
library(tidyr)
library(Seurat)
library(Matrix)
library(Seurat)
library(Matrix)
library(scater)
library(scran)
library(dplyr)
library(scales)
library(batchelor)
dirdata <- "../../data/"
```
# Load data
## Runs 1, 2 and 3 filtered
We load the data obtained after a first filtering step was applied: cells for which mitochondrial genes took up more than 5% of the total UMI counts and that had less than 3,000 detected genes (i.e. less than 3,000 non-zero UMI counts) were filtered out, and genes that were detected in at least 5 cells were conserved. Doublets were removed using `scDblFinder`. See script `FL_01.load.Rmd`
Filtered data can be found in `data/derived/FL/FL_sce.rds`
```{r load sce fl}
sce <- readRDS(paste0(dirdata,'derived/FL/FL_sce.rds'))
```
## Expressed genes
We then select genes detected in at least 5% of the cells of each condition.
We then intersect the genes that are detected across all plates for each condition.
```{r}
sce$run <- paste0('Run',sce$nextseq500_run)
sce_bycond <- SplitObject(as.Seurat(sce), split.by = 'condition')
sce_bycond_filter <- lapply(sce_bycond, function(x) {
x <- CreateSeuratObject(count=x@assays$RNA@counts,
min.cells=round(ncol(x)*5/100),
min.features =3000)
})
intgenes <- Reduce(intersect,(lapply(sce_bycond_filter,rownames)))
intgenes <- intgenes[grep('^ERCC-', intgenes, invert=TRUE)]
length(intgenes)
sce_filter <- lapply(sce_bycond, function(x) x[intgenes,])
sce_filter <- Reduce(merge, sce_filter)
```
`r length(intgenes)` genes are used as the set of "expressed" genes.
# Merging
## Feature selection using quantification of technical noise and performed across batches
The normalization by deconvolution procedure of the `scran` package is used and
`multiBatchNorm` of the `batchelor` package is used to adjust for differences in sequencing depth between batches.
```{r hvgs fetal liver, fig.width=9, fig.height=8}
sce_merge <- lapply(SplitObject(sce_filter, split.by = 'run'), as.SingleCellExperiment)
# normalization by condition using deconvolution
#https://osca.bioconductor.org/normalization.html#normalization-by-deconvolution
clusters <- lapply(sce_merge, quickCluster, min.size = 50, min.mean=0.1)#, use.ranks=FALSE)
sce_merge <- mapply(sce = sce_merge, cluster = clusters, function(sce, cluster) computeSumFactors(sce, clusters=cluster, min.mean=0.1))
sce_merge <- lapply(sce_merge, scater::logNormCounts)
# multiBatchNorm adjusts for differences in sequencing depth between plates by downscaling all batches to match the coverage of the least-sequenced batch.
sce_merge <- do.call(batchelor::multiBatchNorm,sce_merge)
decs <- lapply(sce_merge, function(x) scran::modelGeneVar(x, block=x$condition_plate))#,min.mean=0.05, loess.args=list(span=0.05))) # ,
decs <- lapply(decs, function(x) x[order(x$bio, decreasing=TRUE),])
par(mfrow=c(4,3))
lapply(decs,function(dec.block.416b) {
blocked.stats <- dec.block.416b$per.block
for (i in colnames(blocked.stats)) {
current <- blocked.stats[[i]]
plot(current$mean, current$total, main=i, pch=16, cex=0.5,
xlab="Mean of log-expression", ylab="Variance of log-expression")
curfit <- metadata(current)
points(curfit$mean, curfit$var, col="red", pch=16)
curve(curfit$trend(x), col='dodgerblue', add=TRUE, lwd=2)
}}
)
combined.decs <- do.call(combineVar, lapply(decs, function(x) x[intgenes,]))
bio <- combined.decs[order(combined.decs$bio, decreasing = TRUE),'bio']
chosen <- names(bio[1:3000])
sceall <- lapply(sce_merge, function(x) x[chosen,])
sceall <- do.call(cbind,sceall)
# keep all genes and not only the variable genes in the count matrices
sce_allgenes <- do.call(cbind,sce_merge)
```
We selected `r length(chosen)` genes with the largest positive average biological components across all batches.
We use `batchelor::multiBatchPCA` for PCA computation to equalize contributions across batches. We compute it using the selected set of highly variable genes.
```{r multiBatchPCA}
sceall$condition_run <- paste(sceall$condition, sceall$run, sep='_')
sceall$condition_run_plate <- paste(sceall$condition, sceall$run, sceall$plate, sep='_')
pcs <- batchelor::multiBatchPCA(sceall, batch=sceall$condition_run_plate, BPPARAM= BiocParallel::MulticoreParam(3))
names(pcs)
reducedDim(sceall, "multiBatchPCA") <- do.call(rbind,pcs)[colnames(sceall),]
reducedDim(sce_allgenes, "multiBatchPCA") <- do.call(rbind,pcs)[colnames(sce_allgenes),]
sceall_seurat <- as.Seurat(sce_allgenes)
VariableFeatures(sceall_seurat) <- chosen
```
```{r add the data pre-filtering}
sce_merge <- lapply(SplitObject(as.Seurat(sce), split.by = 'run'), as.SingleCellExperiment)
# normalization by condition using deconvolution
#https://osca.bioconductor.org/normalization.html#normalization-by-deconvolution
clusters <- lapply(sce_merge, quickCluster, min.size = 50, min.mean=0.1)#, use.ranks=FALSE)
sce_merge <- mapply(sce = sce_merge, cluster = clusters, function(sce, cluster) computeSumFactors(sce, clusters=cluster, min.mean=0.1))
sce_merge <- lapply(sce_merge, scater::logNormCounts)
# multiBatchNorm adjusts for differences in sequencing depth between plates by downscaling all batches to match the coverage of the least-sequenced batch.
sce_merge <- do.call(batchelor::multiBatchNorm,sce_merge)
sceallbis <- do.call(cbind,sce_merge)
sceall_seurat[['RNA_raw']] <- Seurat::CreateAssayObject(#counts = counts(sceallbis)[, colnames(sceall_seurat)],
data = logcounts(sceallbis)[, colnames(sceall_seurat)])
sceall_seurat[['RNA_raw']]@counts <- counts(sceallbis)[, colnames(sceall_seurat)]
```
## Merging across runs using `fastMNN`
Batch effects were removed by hierarchically merging the samples using the `reduceMNN` function of the batchelor R package on the 50 principal components calculated with the `multiBatchPCA` function. `fastMNN` detects mutual nearest neighbors (MNN) of cells in different batches, and then uses the MNN to correct the values in each PCA subspace.
Merging is done using the **`r length(chosen)`** variable genes selected above among all expressed genes **`r length(intgenes)`** (i.e. detected in at least 5% of cells across all runs and conditions).
Each condition was merged across runs. Then all conditions were merged in order of E12.5 FL > E10.5 FL. Data is saved in `sceall_seurat_fastMNN_bycond.rds`.
```{r merge by condition and then across conditions, cache.rebuild=TRUE}
mnn.e10.5 <- do.call(batchelor::reducedMNN, c(pcs[grep('E10.5_Fetal_Liver',names(pcs))], args = list(k = 20, auto.merge = TRUE)))
mnn.e12.5 <- do.call(batchelor::reducedMNN, c(pcs[grep('E12.5_Fetal_Liver',names(pcs))], args = list(k = 20, auto.merge = TRUE)))
mnn.final <- batchelor::reducedMNN(mnn.e12.5$corrected,
mnn.e10.5$corrected)
# add fastMNN projection/correction as a new DimReduc object
sceall_seurat[['fastMNN']] <- CreateDimReducObject(embeddings = mnn.final$corrected[colnames(sceall_seurat),], key = 'fastMNN')
# save raw count data
#sceall_seurat[['RNA_raw']] <- Seurat::CreateAssayObject(counts = sce_filter@assays$RNA@counts[, colnames(sceall_seurat)])
```
### UMAP plots colored by run, plates, condition and previous clustering
```{r plot merge by condition and then across conditions, fig.width=10, fig.height=7, dpi=300}
sceall_seurat <- RunTSNE(object = sceall_seurat, dims = 1:50, reduction = 'fastMNN', verbose=FALSE)
sceall_seurat <- RunUMAP(object = sceall_seurat, dims = 1:50, reduction = 'fastMNN', verbose=FALSE)
#gridExtra::marrangeGrob(list(
DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'run', pt.size = 0.75, dims = c(1,2), label = FALSE)#,
DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'plate', pt.size = 0.75, dims = c(1,2), label = FALSE)#,
DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'condition', pt.size = 0.75, dims = c(1,2), label = FALSE)#,
FeaturePlot(object = sceall_seurat , reduction = "umap", features = 'nFeature_RNA', pt.size = 1, dims = c(1,2), label = FALSE)#,
FeaturePlot(object = sceall_seurat , reduction = "umap", features = 'percent.mito', pt.size = 1, dims = c(1,2), label = FALSE)#,
```
### UMAP plots colored by FACS data
```{r plot merge by condition and then across conditions colored by FACS, fig.width=14, fig.height=12}
ggs <- sapply(c("CD41_Ab", "CD61_Ab", "CD71_Ab", "AA4_1_Ab", "CD16_32_Ab", "CD34_Ab", "Mk_gate", "pMac_gate", "CD45_Ab"),
function(i, sceall_seurat) {
sceall_seurat@meta.data[[i]] <- factor(sceall_seurat@meta.data[[i]], levels = c('1','0', 'NA'))
gg <- DimPlot(object = sceall_seurat , reduction = "umap",
#cells.highlight =which(sceall_seurat[[i]] == '1'),
#cols.highlight = 'black',
group.by = i, pt.size = .5, dims = c(1,2), label = FALSE,
cols = c('1'='black', '0'=alpha('grey',.5), 'NA'=alpha('grey',.5))) + ggtitle(i)
return(gg)
}, sceall_seurat , simplify = FALSE)
gridExtra::marrangeGrob(ggs, ncol = 3, nrow=3)
```
```{r mfi plots by condition and then across conditions, fig.width=14, fig.height=9}
ggfs <- sapply(c("FSC_A_MFI", "SSC_A_MFI", "Kit_MFI", "CD41_MFI", "CD61_MFI", "CD71_MFI", "AA4_1_MFI", "CD16_32_MFI", "CD34_MFI", "CD45_MFI"),
function(i, sceall_seurat) {
sceall_seurat@meta.data[[i]] <- as.numeric(sceall_seurat@meta.data[[i]])
gg <- FeaturePlot(object = sceall_seurat , reduction = "umap",
#cells.highlight =which(sceall_seurat[[i]] == '1'),
#cols.highlight = 'black',
features = i, pt.size = .5, min.cutoff="q10", max.cutoff="q95", dims = c(1,2), label = FALSE) + ggtitle(i)
return(gg)
}, sceall_seurat , simplify = FALSE)
gridExtra::marrangeGrob(ggfs, ncol = 4, nrow=3)
# applying biexponential transormation
# from https://haoeric.com/cytometry-transformation-visual-comparation/
#biexp <- function(x, a = 0.5, b = 1, c = 0.5, d = 1, f = 0, w = 0,
# tol = .Machine$double.eps^0.25,
# maxit = as.integer(5000)){
# flowCore:::biexponential_transform(input = x,
# A = a,
# B = b,
# C = c,
# D = d,
# F = f,
# W = w,
# tol = tol,
# maxIt = maxit)
#}
#ggfs <- sapply(c("CD41_MFI", "CD61_MFI", "CD71_MFI", "AA4_1_MFI", "CD16_32_MFI", "CD34_MFI", "CD45_MFI"),
# function(i, sceall_seurat) {
# sceall_seurat@meta.data[[i]] <- biexp(as.numeric(sceall_seurat@meta.data[[i]]))
# gg <- FeaturePlot(object = sceall_seurat , reduction = "umap",
#cells.highlight =which(sceall_seurat[[i]] == '1'),
#cols.highlight = 'black',
# features = i, pt.size = .5, dims = c(1,2), label = FALSE) +
# ggtitle(paste(i, 'biexp trans'))
# return(gg)
# }, sceall_seurat , simplify = FALSE)
#gridExtra::marrangeGrob(ggfs, ncol = 2, nrow=2)
```
### UMAP plots colored with respect to correlation to metacells from the Mouse Cell Atlas project
Test of the package of the Mouse Cell Atlas project providing annotations of cells based on correlation analysis with the MCA dataset
https://github.com/ggjlab/scMCA
Warning: the cell annotation is assigned to the most correlated cell type, as available in a reference matrix provided by scMCA. But we could add a threshold on correlation value or other filters.
We assign 'NA' to the cell types assigned by scMCA that match less than 10 cells (to make the plot a bit more readable).
```{r mca by condition and then across conditions, fig.width=12, fig.height=10, eval=FALSE}
mca_result <- scMCA::scMCA(scdata = sceall_seurat@assays$RNA@data, numbers_plot = 3)
# assignments to top correlations are in
#mca_result$scMCA
mca_result$scMCA <- structure(.Names = names(mca_result$scMCA),
.Data = paste0(mca_result$scMCA, ' (', table(mca_result$scMCA)[mca_result$scMCA], ')'))
sceall_seurat$scMCA <- mca_result$scMCA[colnames(sceall_seurat)]
sceall_seurat$scMCA[grep('\\(1\\)|\\(2\\)|\\(3\\)|\\(4\\)|\\(5\\)|\\(6\\)|\\(7\\)|\\(8\\)|\\(9\\)|\\(10\\)', sceall_seurat$scMCA, perl = FALSE)] <- NA
sceall_seurat$scMCA_score <- apply(mca_result$cors_matrix, 2, max)
```
```{r mca plot by condition and then across conditions, fig.width=10, fig.height=5, eval=FALSE}
DimPlot(object = sceall_seurat , reduction = "umap", group.by = "scMCA", cells = colnames(sceall_seurat)[which(sceall_seurat$scMCA != 'NA')], pt.size = 1, dims = c(1,2))
```
```{r mca score by condition and then across conditions, fig.width=8, fig.height=6, eval=FALSE}
FeaturePlot(object = sceall_seurat , reduction = "umap", features = "scMCA_score", pt.size = 1, dims = c(1,2))
```
```{r, cache.rebuild=TRUE}
saveRDS(sceall_seurat, file = paste0(dirdata,'derived/FL/sceall_seurat_fastMNN_bycond.rds'))
saveRDS(sceall_seurat, file = paste0(dirdata,'derived/FL/sceall_seurat_fastMNN_bycond_precluster.rds'))
```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment