Commit 6c7b70ba authored by Anne  BITON's avatar Anne BITON
Browse files

load and filter UMI counts

parent b722a903
---
title: "Load Liver samples at time points E10.5 and E12.5 from the three different library runs"
author: "Anne Biton"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
prettydoc::html_pretty:
highlight: github
number_sections: yes
theme: cayman
toc: yes
---
```{r setup, eval=TRUE, echo=FALSE, warning = FALSE, message=FALSE, results=FALSE, prompt=FALSE}
knitr::opts_chunk$set(echo = FALSE, cache = TRUE, warning = FALSE, message=FALSE, results=FALSE, prompt=FALSE, dpi=150)
library(data.table)
library(ggplot2)
library(foreach)
library(biomaRt)
library(Biobase)
library(dplyr)
library(tidyr)
library(Seurat)
library(Matrix)
library(SingleCellExperiment)
dirdata <- "../../data/"
```
# Load data
## Runs 1, 2 and 3
Libraries are available in `scMCE/data/raw/libraryRun123/umi.tab.fl/`
Annotation of the plates and cells is available in `scMCE/data/raw/libraryRun123/Cells_Run123_FL.xlsx`
```{r}
gse <- GEOquery::getGEO('GSE166223',GSEMatrix=TRUE)
#sampleNames(gse$GSE166223_series_matrix.txt.gz@phenoData)
# download annotation file
download.file("https://ftp.ncbi.nlm.nih.gov/geo/series/GSE166nnn/GSE166223/suppl/GSE166223_Samples.xlsx", destfile = './temp.xlsx')
annot_fl <- readxl::read_xlsx("./temp.xlsx")
annot_fl <- dplyr::filter(annot_fl, Tissue == 'Fetal Liver')
colnames(annot_fl) <- gsub(' ', '_', tolower(colnames(annot_fl)))
# download UMI counts
countfiles_fl <- as.character(pData(gse$GSE166223_series_matrix.txt.gz)[pData(gse$GSE166223_series_matrix.txt.gz)$"tissue:ch1" == 'fetal liver',]$supplementary_file_1)
umicounts <- lapply(countfiles_fl, data.table::fread)
names(umicounts) <- countfiles_fl
umicounts <- lapply(umicounts, function(x) {
# sometimes column names have to be shifted by one
cn <- names(x)
if (cn[1] == 'V1') {
setnames(x, 'V1', 'gene')
} else {
setnames(x,c('gene', cn[-length(cn)]))
}
x
})
# get genes available in all plates
allgenes <- Reduce(intersect,lapply(umicounts, function(x) x$gene))
umicounts <- lapply(umicounts, function(x) x[match(allgenes,gene),])
umicounts <- lapply(umicounts, function(x) x[,-1,with=F])
umicounts <- do.call(cbind,umicounts)
umicounts <- cbind(data.table(gene=allgenes,umicounts))
# zero counts were assigned to the few cells that came from the Yolk Sac for mixed plates from the sort (Plate 15 Run2), so we remove them now.
umicounts <- umicounts[,which(colSums(umicounts[,-1]) > 0), with=FALSE]
```
```{r}
#annot_fl <- readxl::read_xlsx(paste0(dirdata, 'raw/libraryRun123/Cells_Run123_FL.xlsx'))
annot_fl <- annot_fl %>% mutate(condition = gsub(' ', '_', annot_fl$condition))
annot_fl <- annot_fl %>% mutate(tissue = gsub(' ', '_', annot_fl$tissue))
annot_fl <- annot_fl %>% mutate(plate = gsub('.$', '', annot_fl$halfplate))
annot_fl <- annot_fl %>% mutate(condition_plate = paste(condition,plate, sep='_'))
annot_fl <- annot_fl %>% mutate(condition_halfplate = paste(condition,halfplate, sep='_'))
annot <- annot_fl[match(gsub('.txt.gz.P.*', '', gsub(pattern = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5066nnn/GSM.*/suppl/GSM.*_E", replacement='E', colnames(umicounts)[-1])),gsub('.txt', '', annot_fl$filename)),] #annot_fl[match(gsub('\\.P.*', '', colnames(umicounts))[-1],gsub('.txt', '', annot_fl$filename)),]
annot <- annot %>% mutate(cellID=colnames(umicounts)[-1])
annot <- annot %>% mutate(run = nextseq500_run)
colnames(umicounts) <- gsub('.txt.gz', '', gsub(pattern = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5066nnn/GSM.*/suppl/GSM.*_E", replacement='E', colnames(umicounts)))
annot$cellID <- gsub('.txt.gz', '', gsub(pattern = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5066nnn/GSM.*/suppl/GSM.*_E", replacement='E', as.character(annot$cellID)))
```
```{r load FACS}
# facsfile <- paste0(dirdata, 'raw/libraryRun123/FACS/IndexSort_FL_E10.5_E12.5.xlsx')
# sheets <- readxl::excel_sheets(facsfile)
# facs <- lapply(sheets, function(x) readxl::read_xlsx(facsfile, sheet = x))
# names(facs) <- sheets
# names(facs) <- gsub('FL ', '_Fetal_Liver_', names(facs))
# names(facs) <- gsub(' ', '_', names(facs))
#
# facs <- do.call(rbind, mapply(x=facs, n=names(facs), function(x,n) {
# x$well_id <- paste0(n,'.',x$well_id);
# x
# }, SIMPLIFY=F))
#saveRDS(facs, file = '../data/raw/FL_facs.rds')
facs <- readRDS(paste0(dirdata,'raw/FL_facs.rds'))
```
We loaded UMI counts data coming from `r length(unique(annot$run))` library/sequencing runs and `r length(unique(annot$filename))` halfplates, for `r length(unique(annot$condition))` conditions.
The distribution of plates, conditions, and genotypes across runs is:
`r table(annot$condition_plate, annot$nextseq500_run, useNA='ifany') %>% knitr::kable()`.
# Number of cells, UMIs, detected genes, ...
```{r nb reads and zeros, cache=FALSE}
nbUMIs <- colSums(umicounts[,-1])
non0genes <- colSums(umicounts[,-1] > 0)
only0genes <- colSums(umicounts[,-1] == 0)
only0cells <- colSums(umicounts[,-1] == 0)
nb0genes <- colSums(umicounts[,-1] == 0)
prop0genes <- colSums(umicounts[,-1] == 0)/nrow(umicounts)
nbnot0genes <- colSums(umicounts[,-1] > 0)
```
## Number of detected genes across cells for each half plate
```{r, fig.width=10, fig.height=5}
dt0g <- data.table(nbZero = c(t(as.data.table(non0genes))),
filename = gsub('\\.P.*', '', names(non0genes)),
run=as.factor(annot$nextseq500_run),
condition = annot$condition)
dt0g <- dt0g[order(dt0g$run,dt0g$filename, dt0g$nbZero),]
gg <- ggplot(dt0g,
aes(x = filename, y = nbZero, fill=run)) +
geom_bar(stat='identity',
position=position_dodge2(1),
size=.3,
width = 0.5) +
ylab( '#detected genes') +
xlab('half plate') +
theme_bw( ) +
theme( axis.text.x = element_text(angle = 90, size = 10, vjust = 0.5, hjust = 1),
axis.title.y = element_text(size = 10),
axis.text.y = element_text(size = 10),
axis.text.x.top = element_text(size = 10))
print(gg)
```
## Number of UMIs across cells for each half plate
```{r plot nb umis}
data.table(nbUMIs = c(t(as.data.table(nbUMIs))),
filename = gsub('\\.P.*', '', names(nbUMIs)),
run=as.factor(annot$nextseq500_run),
condition = annot$condition) %>%
.[order(run,filename,nbUMIs)] %>%
ggplot(.,aes(x = filename, y = nbUMIs, fill=run)) +
geom_bar(stat='identity',
position=position_dodge2(1),
size=.3,
width = 0.5) +
ylab( 'total UMI counts') +
xlab('half plate') +
theme_bw( ) +
theme( axis.text.x = element_text(angle = 90, size = 10, vjust = 0.5, hjust = 1),
axis.title.y = element_text(size = 10),
axis.text.y = element_text(size = 10),
axis.text.x.top = element_text(size = 10))
```
## Fraction of mitochondrial genes across cells for each half plate
```{r plot mito, fig.height=5}
mito <- grep(pattern = "^mt-", x = umicounts$gene, value = TRUE, ignore.case = TRUE)
propmito <- colSums(umicounts[umicounts$gene %in% mito,-1])/colSums(umicounts[,-1])
data.table(nbUMIs = c(t(as.data.table(propmito))),
filename = gsub('\\.P.*', '', names(propmito)),
run = as.factor(annot$nextseq500_run)) %>%
.[order(propmito,filename),] %>%
ggplot(.,
aes(x = filename, y = nbUMIs, fill=run)) +
geom_bar(stat='identity',
position=position_dodge2(1),
size=.3,
width = 0.5) +
ylab( 'fraction of mitochondrial genes') +
xlab('half plate') +
theme_bw( ) +
theme( axis.text.x = element_text(angle = 90, size = 10, vjust = 0.5, hjust = 1),
axis.title.y = element_text(size = 10),
axis.text.y = element_text(size = 10),
axis.text.x.top = element_text(size = 10))
```
## Gene with largest UMI count per cell
This is to check whether there are genes that are predominant in a large number of cells.
```{r max gene, results='asis', cache.lazy=FALSE}
# remove cells with zero counts to avoid dividing by zero
umicountssel <- umicounts[grep('ERCC-', umicounts$gene, invert = TRUE),-1]
umicountssel <- umicountssel[,which(colSums(umicountssel)>0), with=F]
frac = scale(umicountssel, scale = colSums(umicountssel), center=FALSE)
frac <- as.data.table(frac)
frac <- frac[, gene := umicounts$gene[grep('ERCC-', umicounts$gene, invert = TRUE)]]
fracm <- melt(frac)
setnames(fracm, c('gene', 'cellid', 'frac'))
rm(frac)
# max gene per cell
maxfrac <- fracm[, list(genemax=gene[which.max(frac)], fracmax=frac[which.max(frac)]), by=c('cellid')]
maxfrac[, run := paste0('run',as.character(annot[match(maxfrac$cellid, annot$cellID),]$run))]
maxfrac[, plate := annot[match(maxfrac$cellid, annot$cellID),]$condition_plate]
maxfrac[, nbOcc := length(cellid), by = 'genemax']
maxfrac <- maxfrac[order(nbOcc, decreasing = TRUE),]
maxfrac_all <- maxfrac[, list(nbOcc = length(cellid)), by = 'genemax']
head(maxfrac_all, 20) %>% knitr::kable() %>% print()
```
There are `r sum(table(maxfrac$genemax)>=50)` genes that are predominant in at least 50 cells: `r names(sort(table(maxfrac$genemax), decreasing=TRUE)[sort(table(maxfrac$genemax), decreasing=TRUE)>=50])`.
## Seurat violin plots
A Seurat object containing the unfiltered UMI counts is saved in the RData file `data/derived/FL_umis.rds`.
```{r, fig.height=7, fig.width=13}
rownames(annot) <- annot$cellID
umis <- CreateSeuratObject(counts = Matrix(as.matrix(umicounts[,-1]), dimnames = list(umicounts$gene, colnames(umicounts)[-1])),
meta.data = as.data.frame(annot))
mito <- grep(pattern = "^mt-", x = umicounts$gene, value = TRUE, ignore.case = TRUE)
propmito <- colSums(umicounts[umicounts$gene %in% mito,-1])/colSums(umicounts[,-1])
umis[['percent.mito']] <- propmito
umis$condition_plate <- paste(umis$condition, umis$plate, sep='_')
umis$condition_replicate_plate <- paste(umis$condition, umis$replicate, umis$plate, sep='_')
umis$condition_halfplate <- factor(umis$condition_halfplate, levels = unique(umis$condition_halfplate[order(umis$nextseq500_run)]))
VlnPlot(object = umis, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"),
ncol = 3, group.by = 'condition_halfplate', pt.size = .05,
cols = annot_fl$nextseq500_run[match(levels(umis$condition_halfplate), annot_fl$condition_halfplate)]+1)
dir.create(paste0(dirdata,'derived/FL/'))
save(umis, file=paste0(dirdata,'derived/FL/FL_umis.rds'))
```
```{r, fig.height=5, fig.width=12}
#select condition colors
cols_condition <- c("#e6dc85", "#97d2db")
cols_condition <- structure(.Data=alpha(cols_condition, .5))
#order
#vln_levels <- c("E10.5_Fetal_Liver", "E12.5_Fetal_Liver")
#umis$condition <- factor(x=umis$condition, levels = vln_levels)
VlnPlot(object = umis, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"),
ncol = 3, group.by = 'condition', pt.size = .05, cols = cols_condition)
```
## Histogram of number of detected genes
blue line = 2,000 cutoff
pink line = 2,750 cutoff
red line = 3,000 cutoff
```{r hist and scatter, fig.height=10, fig.width=10}
par(mfrow=c(5,5))
for (cp in unique(umis$condition_halfplate)) {
hist(umis$nFeature_RNA[umis$condition_halfplate == cp],
main = cp, xlab='#detected genes', breaks=30)
abline(v=2750, col = 'pink', lty = 2)
abline(v=3000, col = 'red', lty = 2)
abline(v=2000, col = 'blue', lty = 2)
#umis@meta.data[, c('nFeature_RNA', 'condition_halfplate')] %>% ggplot(., aes(x=nFeature_RNA)) + geom_histogram(binwidth = 0.5) + facet_wrap(~condition_halfplate)
}
```
## Number of UMIs versus number of detected genes
blue line = 2,000 cutoff
pink line = 2,750 cutoff
red line = 3,000 cutoff
```{r scatter, fig.height=10, fig.width=10}
par(mfrow=c(5,5))
for (cp in unique(umis$condition_halfplate)) {
smoothScatter(x=umis$nFeature_RNA[umis$condition_halfplate == cp],
y=umis$nCount_RNA[umis$condition_halfplate == cp],
main = cp, xlab='#detected genes', ylab='#UMIs')#,
# xlim = c(0,11000), ylim = c(0, 30000))
abline(v=3000, col = 'red', lty = 2)
abline(v=2750, col = 'pink', lty = 2)
abline(v=2000, col = 'blue', lty = 2)
}
```
```{r scatter by condition, fig.height=3, fig.width=6}
#by condition
par(mfrow=c(1,2))
for (cp in unique(umis$condition)) {
smoothScatter(x=umis$nFeature_RNA[umis$condition == cp],
y=umis$nCount_RNA[umis$condition == cp],
main = cp, xlab='#detected genes', ylab='#UMIs')
abline(v=3000, col = 'red', lty = 2)
}
```
## Number of cells with at least 2,000, 2,750 or 3,000 detected genes
```{r, fig.width=8}
dt0g <- data.table(nbZero = c(t(as.data.table(non0genes))),
filename = gsub('\\.P.*', '', names(non0genes)),
condition = annot$condition)
dt0g <- dt0g[order(dt0g$nbZero,dt0g$filename),]
dt0g[, nbCells:=sum(nbZero > 2000), by = filename] %>%
ggplot(.,
aes(x = filename, y = nbCells)) +
geom_bar(stat='sum',
position=position_dodge2(1),
size=.3,
colour="darkblue", fill="darkblue",
width = 0.5) +
ylab( '# cells with >2,000 detected genes') +
xlab('halfplate') +
theme_bw( ) +
theme( axis.text.x = element_text(angle = 90, size = 10, vjust = 0.5, hjust = 1),
axis.title.y = element_text(size = 10),
axis.text.y = element_text(size = 10),
axis.text.x.top = element_text(size = 10))
dt0g[, nbCells:=sum(nbZero > 3000), by = filename] %>%
ggplot(.,
aes(x = filename, y = nbCells)) +
geom_bar(stat='sum',
position=position_dodge2(1),
size=.3,
colour="darkred", fill="darkred",
width = 0.5) +
ylab( '# cells with >3,000 detected genes') +
xlab('halfplate') +
theme_bw( ) +
theme( axis.text.x = element_text(angle = 90, size = 10, vjust = 0.5, hjust = 1),
axis.title.y = element_text(size = 10),
axis.text.y = element_text(size = 10),
axis.text.x.top = element_text(size = 10))
```
```{r, results=TRUE}
dt0g[, list(nbCells_atLeast2000genes =sum(nbZero > 2000),
nbCells_atLeast2750genes =sum(nbZero > 2750),
nbCells_atLeast3000genes =sum(nbZero > 3000)), by = filename] %>% knitr::kable() %>% print()
dt0g[, list(nbCells_atLeast2000genes =sum(nbZero > 2000),
nbCells_atLeast2750genes =sum(nbZero > 2750),
nbCells_atLeast3000genes =sum(nbZero > 3000)), by = condition] %>% knitr::kable() %>% print()
```
# Cell and gene filtering
## Filtering out cells with less than 3,000 detected genes for all plates
As a first selection, we filter out cells for which mitochondrial genes takes up more than 5% of the total UMI counts, and that have less than 3,000 detected genes (i.e. less than 3,000 non-zero UMI counts). We keep genes that are detected in at least 5 cells. We remove ERCC spike-in RNAs. Then we exclude doublets that are identified using the package `scDblFinder`.
```{r}
sceall <- subset(umis, subset = percent.mito < 0.05 & nFeature_RNA > 3000)
# add meta data
sceall@meta.data <- umis[,colnames(sceall)]@meta.data
# filter out genes detected in less than 5 cells
# and remove spike-ins which are not usable anyway given their very low counts
sceall <- CreateSeuratObject(count=sceall@assays$RNA@counts[grep('^ERCC-', rownames(sceall), invert = TRUE),],
min.cells=5, meta.data = sceall@meta.data)
sce <- as.SingleCellExperiment(sceall) #(assays=list(counts=umisRun2@assays$RNA@counts))
```
`r ncol(sceall)` cells and `r nrow(sceall)` genes were selected .
## Doublet detection
Doublet detection is done using the package `scDblFinder` using default parameters and running the analysis for each plate.
see https://www.biorxiv.org/content/10.1101/2020.02.02.930578v1.full.pdf,.
```{r doublet, eval=FALSE}
library(scDblFinder)
sce$groupForDoubletDetection <- sce$condition_plate
sce$groupForDoubletDetection[sce$condition_plate %in% c("E10.5_Fetal_Liver_10", "E10.5_Fetal_Liver_14","E10.5_Fetal_Liver_15")] <- 'E10.5_Fetal_Liver_plate101415'
sce <- scDblFinder(sce, samples=sce$groupForDoubletDetection, minClusSize = 50)#, BPPARAM=MulticoreParam(3))
# how many cells are detected to be doublets ?
table(sce$scDblFinder.class)
```
<!-- Distribution of cells detected as doublets across plates by `scDblFinder`: `r #table(sce$scDblFinder.class, sce$condition_halfplate) %>% knitr::kable()`. -->
<!-- We filter out `r sum(sce$scDblFinder.class == 'doublet')` cells detected as being doublets by `scDblFinder`. -->
```{r remove doublets, eval=FALSE}
sceall <- sceall[, sce$scDblFinder.class != 'doublet']
sce <- sce[, sce$scDblFinder.class != 'doublet']
```
Since we can't reproduce exactly the same selection of cells as the on in the paper (scDblFinder v1.1.8 doublet selection seems to vary a bit across runs of the same function). We saved the IDs of the selected cells in data/derived/YS and are loading them now.
We also discard plate 15 that only contains less than 10 cells passing the quality thresholds.
```{r}
selcells <- readRDS(file =paste0(dirdata,'derived/FL/FL_selcells.rds'))
sce <- sce[,selcells]
sceall <- sceall[,selcells]
```
## Number of cells after this first filtering step
**The number of filtered cells per plate :** `r table(sceall$condition_replicate_plate) %>% knitr::kable()`
**The number of filtered cells per halfplate :** `r table(sceall$filename) %>% knitr::kable()`
**The number of filtered cells per condition :** `r table(sceall$condition) %>% knitr::kable()`
```{r, cache.rebuild=TRUE}
colData(sce) <- cbind(colData(sce), facs[match(paste0(sce$condition, '_Run', sce$nextseq500_run, '_P', sce$plate, '.', gsub('.*\\.', '', sce$cellID)), facs$well_id),])
dir.create(paste0(dirdata,"/derived/FL/"))
saveRDS(sce, file=paste0(dirdata,'/derived/FL/FL_sce.rds'))
```
The UMI counts of the data remaining after this first filtering step are available in `data/derived/FL_E10.5_E12.5_library123/FL_sce.rds`.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment