diff --git a/src/FL/FL_03.clustering.Rmd b/src/FL/FL_03.clustering.Rmd new file mode 100644 index 0000000000000000000000000000000000000000..74eaec6c8a47db57260c712b69facf55587748db --- /dev/null +++ b/src/FL/FL_03.clustering.Rmd @@ -0,0 +1,1009 @@ +--- +title: "Cluster E10.5 and E12.5 Fetal Liver samples" +author: "Anne Biton" +date: "`r format(Sys.time(), '%d %B, %Y')`" + +output: + prettydoc::html_pretty: + highlight: github + number_sections: yes + theme: cayman + toc: yes +--- + + + + +```{r setup, eval=TRUE, echo=FALSE, warning = FALSE, message=FALSE, results=FALSE, prompt=FALSE} +knitr::opts_chunk$set(echo = FALSE, cache = TRUE, warning = FALSE, message=FALSE, results=FALSE, prompt=FALSE, dpi=150) + +library(renv) +library(data.table) +library(ggplot2) +library(foreach) +library(biomaRt) +library(BiocParallel) +library(clusterExperiment) +library(matrixStats) +library(Seurat) +library(Matrix) +library(scater) +library(scran) +library(dplyr) +library(scales) +library(tidyverse) +library(SingleCellExperiment) +library(igraph) +library(randomcoloR) +library(batchelor) +library(openxlsx) +library(pheatmap) + +dirdata <- "../../data/" +``` + + +# Load data + +## Merging first by condition then across conditions + +Each condition was merged across runs. Then all conditions were merged in order of E12.5 FL > E10.5 FL. + + + +# UMAP plots of the merged dataset + +Merging is done using the 3,000 variable features selected above. +See script `FL_02.merge.Rmd` +Each condition was merged across runs. Then all conditions were merged in order of E12.5 FL > E10.5 FL. Data available in `data/derived/FL/FL_sceall_seurat_fastMNN_bycond.rds`. + + + +```{r} +filesce <- paste0(dirdata,'derived/FL/FL_sceall_seurat_fastMNN_bycond.rds') +sceall_seurat <- readRDS(filesce) +``` + +```{r load FL run123 progenitors and colors} + +# load progenitors from run 123 FL (CMP/GMP/MEP/P1) +cell2progenitor <- readRDS(paste0(dirdata,'derived/FL/FL_cell2progenitor_run123.rds')) +sceall_seurat$progenitors <- cell2progenitor[colnames(sceall_seurat)] + + +cols_progenitors <- c('CMP' = "#d570fa", 'GMP' = "#4e9dfc", 'MEP' = "#ed5a32", 'P1' = "#eda532", 'NA' = "#ffffff") +cols_progenitors <- structure(.Data=alpha(cols_progenitors, .6), .Names=names(cols_progenitors)) + + +``` + + +```{r} +mart <- useMart(biomart="ensembl", dataset="mmusculus_gene_ensembl") +geneAnnot <- biomaRt::getBM(values = rownames(sceall_seurat@assays$RNA_raw@data), + attributes = c('mgi_symbol', 'mgi_id', 'entrezgene_id','description', 'chromosome_name', + 'start_position', 'end_position', 'strand', 'gene_biotype'), + filters = 'mgi_symbol', mart=mart) +geneAnnot$mgi_id <- paste0('http://www.informatics.jax.org/marker/',geneAnnot$mgi_id) +# add missing +geneAnnot <- rbind(geneAnnot, data.frame( mgi_symbol=setdiff(rownames(sceall_seurat@assays$RNA_raw@data), geneAnnot$mgi_symbol), + mgi_id=NA, entrezgene_id=NA, description=NA, chromosome_name=NA, start_position=NA, + end_position=NA, strand=NA, gene_biotype=NA)) + +``` + + +```{r} +sceall_seurat <- RunTSNE(object = sceall_seurat, dims = 1:50, reduction = 'fastMNN', verbose=FALSE) +sceall_seurat <- RunUMAP(object = sceall_seurat, dims = 1:50, reduction = 'fastMNN', verbose=FALSE) +``` + + + +```{r, fig.width=10, fig.height=7, dpi=300} + +#'E9.5_Yolk_Sac' = "lightpink", 'E10.5_Yolk_Sac' = "goldenrod1", +cols_condition <- c("#e6dc85", "#97d2db") +cols_condition <- structure(.Data=alpha(cols_condition, .3), + .Names=c('E10.5_Fetal_Liver', 'E12.5_Fetal_Liver')) + + +DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'run', pt.size = 2, dims = c(1,2), label = FALSE) +DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'plate', pt.size = 2, dims = c(1,2), label = FALSE) +DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'condition', cols = cols_condition, pt.size = 3, dims = c(1,2), label = FALSE) +FeaturePlot(object = sceall_seurat , reduction = "umap", features = 'nFeature_RNA', pt.size = 1, dims = c(1,2), label = FALSE) +FeaturePlot(object = sceall_seurat , reduction = "umap", features = 'percent.mito', pt.size = 2, dims = c(1,2), label = FALSE) + +cols_clustersrun123 <- c('late_Ery' = "#D9D9D9", 'Mast' = "#FDB462", 'EMP' = "#B3DE69", 'MacP' = "#BEBADA", 'Mk_I' = "#BC80BD", 'mid_Ery' = "#FFFFB3", 'Mk_II' = "#8DD3C7", 'early_Ery' = "#FCCDE5", 'GMP' = "#80B1D3", 'Mk_III' = "#8A8A8A", 'pMac' = "#FB8072") +cols_clustersrun123 <- structure(.Data=alpha(cols_clustersrun123, .8), .Names=names(cols_clustersrun123)) + +cols_progenitors <- c('CMP' = "#d570fa", 'GMP' = "#4e9dfc", 'MEP' = "#ed5a32", 'P1' = "#eda532", 'NA'="white") +cols_progenitors <- structure(.Data=alpha(cols_progenitors, .4), .Names=names(cols_progenitors)) + + +#DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'cluster_libraryRun123', cells = colnames(sceall_seurat)[which(sceall_seurat$cluster_libraryRun123 != 'NA')], cols = cols_clustersrun123, pt.size = 2, dims = c(1,2), label = FALSE) + ggtitle(paste0("Cluster Annotations from All Cells Run123bycond "))#, + +cols <- c(cols_condition, cols_clustersrun123, cols_progenitors) + + +# Cells to use +E10.5FL_cells <-sceall_seurat@meta.data %>% tibble::rownames_to_column('cell') %>% + filter(condition == "E10.5_Fetal_Liver") %>% dplyr::select('cell') %>% unlist + +E12.5FL_cells <-sceall_seurat@meta.data %>% tibble::rownames_to_column('cell') %>% + filter(condition == "E12.5_Fetal_Liver") %>% dplyr::select('cell') %>% unlist + +DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'progenitors', cells = E10.5FL_cells, cols = cols_progenitors, pt.size = 2, dims = c(1,2), label = FALSE) + ggtitle(paste0("Progenitor Annotations from E10.5 FL Index Sort"))#, + +DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'progenitors',cells = E12.5FL_cells, cols = cols_progenitors, pt.size = 2, dims = c(1,2), label = FALSE) + ggtitle(paste0("Progenitor Annotations from E12.5 FL Index Sort"))#, + +DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'progenitors', cols = cols_progenitors, pt.size = 2, dims = c(1,2), label = FALSE) + ggtitle(paste0("Progenitor Annotations from FL Index Sort"))#, + + +#nFeature with spectral scale +sceall_seurat[['log2_nCount_RNA']] <- log2(sceall_seurat[['nCount_RNA']]) +ggplot(data = data.frame(UMAP_1 = sceall_seurat@reductions$umap@cell.embeddings[,1], + UMAP_2 = sceall_seurat@reductions$umap@cell.embeddings[,2], + log2_nCount_RNA = sceall_seurat[['log2_nCount_RNA']])) + geom_point(aes(UMAP_1, UMAP_2, col = log2_nCount_RNA), size = 2) + theme_classic() + scale_color_gradientn(colors = rev(RColorBrewer::brewer.pal(10, "Spectral"))) + + +``` + + + + + +# Clustering + + + +## Clustering using SNN-graph and random walks using the `scran` and `igraph` packages + +A shared nearest-neighbor graph (SNN) is built using the `buildSNNGraph` function of the `scran` package. +Then communities, which are densely connected subgraphs, are searched for in the SNN graph using the method of random walks implemented in the `cluster_walktrap` function of the `igraph` package. +See Pascal Pons, Matthieu Latapy: Computing communities in large networks using random walks, http://arxiv.org/abs/physics/0512106. + + +### Cluster modularity + +We look at the ratio of the observed and expected edge weights to visualize the modularity of the clusters. + + + + +```{r compute SNN clusters, fig.width=2.5, fig.height=2} +#We can verify this by clustering on the corrected low-dimensional values using a graph-based method (Xu and Su 2015). + +g10 <- buildSNNGraph(sceall_seurat@reductions$fastMNN@cell.embeddings, d=NA, transposed=TRUE, k = 10) +sceall_seurat$snn_scran_k10 <- igraph::cluster_walktrap(g10, steps = 5)$membership +g11 <- buildSNNGraph(sceall_seurat@reductions$fastMNN@cell.embeddings, d=NA, transposed=TRUE, k = 11) +sceall_seurat$snn_scran_k11 <- igraph::cluster_walktrap(g11, steps = 5)$membership +g12 <- buildSNNGraph(sceall_seurat@reductions$fastMNN@cell.embeddings, d=NA, transposed=TRUE, k = 12) +sceall_seurat$snn_scran_k12 <- igraph::cluster_walktrap(g12, steps = 5)$membership +g13 <- buildSNNGraph(sceall_seurat@reductions$fastMNN@cell.embeddings, d=NA, transposed=TRUE, k = 13) +sceall_seurat$snn_scran_k13 <- igraph::cluster_walktrap(g13, steps = 5)$membership + +par(mfrow=c(3,2)) +cluster.mod <- clusterModularity(g10, sceall_seurat$snn_scran_k10, get.weights=TRUE) +log.ratio <- log2(cluster.mod$observed/cluster.mod$expected + 1) +p1 <- pheatmap(log.ratio, cluster_rows=FALSE, cluster_cols=FALSE, + color=colorRampPalette(c("white", "blue"))(100), main = 'k = 10') +cluster.mod <- clusterModularity(g11, sceall_seurat$snn_scran_k11, get.weights=TRUE) +log.ratio <- log2(cluster.mod$observed/cluster.mod$expected + 1) +p2 <- pheatmap(log.ratio, cluster_rows=FALSE, cluster_cols=FALSE, + color=colorRampPalette(c("white", "blue"))(100), main = 'k = 11') +cluster.mod <- clusterModularity(g12, sceall_seurat$snn_scran_k12, get.weights=TRUE) +log.ratio <- log2(cluster.mod$observed/cluster.mod$expected + 1) +p3 <- pheatmap(log.ratio, cluster_rows=FALSE, cluster_cols=FALSE, + color=colorRampPalette(c("white", "blue"))(100), main = 'k = 12') +cluster.mod <- clusterModularity(g13, sceall_seurat$snn_scran_k13, get.weights=TRUE) +log.ratio <- log2(cluster.mod$observed/cluster.mod$expected + 1) +p4 <- pheatmap(log.ratio, cluster_rows=FALSE, cluster_cols=FALSE, + color=colorRampPalette(c("white", "blue"))(100), main = 'k = 13') + + +``` + + + + +```{r plot snn clusters, fig.width=14, fig.height = 12} + +ggclusterk10 <- DimPlot(object = sceall_seurat , reduction = "umap", pt.size = 1, dims = c(1,2), group.by = 'snn_scran_k10', cols = 'Set3') + ggtitle('k=10') +ggclusterk11 <- DimPlot(object = sceall_seurat , reduction = "umap", pt.size = 1, dims = c(1,2), group.by = 'snn_scran_k11', cols = 'Set3') + ggtitle('k=11') +ggclusterk12 <- DimPlot(object = sceall_seurat , reduction = "umap", pt.size = 1, dims = c(1,2), group.by = 'snn_scran_k12', cols = 'Set3') + ggtitle('k=12') +ggclusterk13 <- DimPlot(object = sceall_seurat , reduction = "umap", pt.size = 1, dims = c(1,2), group.by = 'snn_scran_k13', cols = 'Set3') + ggtitle('k=13') + +multiplot(ggclusterk10, ggclusterk11, ggclusterk12, ggclusterk13, layout=matrix(c(1,2,3,4), nrow=2, byrow=TRUE)) + +``` + + +## Clustering obtained with random walks on a SNN graph using $k=12$ + +We use the clustering obtained with random walks on a SNN graph using $k=12$ nearest neighbors. + +```{r clustering k12, fig.width=12, fig.height=10} + +# extract colors used for each cluster in the previous plots +#dcol=unique(ggplot_build(ggclusterk12)$data[[1]][,c('colour','group')]); +#dcol=dcol[order(dcol$group),] +#dcol <- structure(.Data = dcol$colour, .Names = dcol$group)#-1) +# add them to color annotations +#cols <- dcol + +# Rename idents to place cluster number in order from EMP>Mye>Ery>Mk for sake of discussion in the paper +Idents(sceall_seurat) <- "snn_scran_k12" +sceall_seurat <- RenameIdents(sceall_seurat, '1'="5", '2'="6", '3'="2", '4'="9", '5'="7", '6'="4", '7'="1", '8'="8", '9'="3", '10'="10") + + +sceall_seurat$snn_scran_k12 <- paste(Idents(sceall_seurat)) +clus2ident <- c('1'="EMP", '2'="MP", '3'="GP", '4'="Mac", '5'="Mast", '6'='EP', '7'="early_Ery", '8'="mid_Ery", '9'="late_Ery", '10'="MkP") + +dircluster <- paste0(dirdata,'derived/FL/cluster_markers/') +dir.create(dircluster) + +clusteringchoice <- 'snn_scran_k12' +filelabel <- "snnRandomWalksClustering_k12" +plottitle <- 'k=12' + +Idents(sceall_seurat) <-clus2ident[as.character(sceall_seurat[[clusteringchoice]][,1])] +Idents(sceall_seurat) <- factor(Idents(sceall_seurat), levels = c("EMP", "MP", "GP", "Mac", "Mast", "EP", "early_Ery", "mid_Ery", "late_Ery", "MkP")) +sceall_seurat$clusterIdents <- sceall_seurat$knn_clusters <- Idents(sceall_seurat) + +#set cluster colors +cols_knn_clusters <- c('EMP' = "#B3DE69", 'MP' = "#BEBADA", 'GP' = "#80B1D3", 'Mac' = "#FB8072", 'Mast' = "#FDB462", 'EP' = "#8DD3C7", 'early_Ery' = "#FCCDE5", 'mid_Ery' = "#FFFFB3", 'late_Ery' = "#D9D9D9", 'MkP' = "#BC80BD") +cols_knn_clusters <- structure(.Data=alpha(cols_knn_clusters, .8), .Names=names(cols_knn_clusters)) + +cols_snn_scran12 <- c('1' = "#B3DE69", '2' = "#BEBADA", '3' = "#80B1D3", '4' = "#FB8072", '5' = "#FDB462", '6' = "#8DD3C7", '7' = "#FCCDE5", '8' = "#FFFFB3", '9' = "#D9D9D9", '10' = "#BC80BD") +cols_snn_scran12 <- structure(.Data=alpha(cols_snn_scran12, .8), .Names=names(cols_snn_scran12)) + + +cols_knn_clusters2 <- c('EMP' = "#e6c9ff", 'MP' = "#8a90ff", 'GP' = "#8fc1ff", 'Mac' = "#323e99", 'Mast' = "#f7c868", 'EP' = "#ffc9ba", 'early_Ery' = "#ff8969", 'mid_Ery' = "#fa435f", 'late_Ery' = "#ad0720", 'MkP' = "#fa7602") +cols_knn_clusters2 <- structure(.Data=alpha(cols_knn_clusters2, .6), .Names=names(cols_knn_clusters2)) + + + +cols <- c(cols, cols_knn_clusters) + +sceall <- as.SingleCellExperiment(sceall_seurat) + +sceall_seurat_RNAraw <- sceall_seurat; +sceall_seurat_RNAraw@active.assay <- 'RNA_raw' +sceall_RNAraw <- as.SingleCellExperiment(sceall_seurat, assay = 'RNA_raw') + +``` + + +```{r plot UMAP of picked clustering, fig.width=10, fig.height=7, dpi=600} + + +# plot clusters with legend +# reorder legend +sceall_seurat$knn_clusters <- factor(sceall_seurat$knn_clusters, levels=c("EMP", "MP", "GP", "Mac", "Mast", "EP", "early_Ery", "mid_Ery", "late_Ery", "MkP")) + +DimPlot(sceall_seurat, reduction = "umap", group.by = "knn_clusters", pt.size = 1.5, label = FALSE, label.size=6, cols=cols_knn_clusters2) + ggtitle(paste0("UMAP : Random Walk Clustering, ", plottitle)) + +DimPlot(sceall_seurat, reduction = "umap", group.by = "knn_clusters", pt.size = 1.5, label = TRUE, label.size=6, cols=cols_knn_clusters2) + ggtitle(paste0("UMAP : Random Walk Clustering, ", plottitle)) +``` + +```{r plot UMAP of FACS assigned gates, fig.width=10, fig.height=7, dpi=600} + + +DimPlot(sceall_seurat, reduction = "umap", group.by = "progenitors", pt.size = 1.5, label = FALSE, cols=cols_progenitors) + ggtitle(paste0("UMAP :", 'progenitor gate')) + +``` + + +```{r plot interactive UMAP plot of picked clustering, fig.width=10, fig.height=7, results=TRUE} +library(plotly) +g <- ggplot(as.data.table(cbind(sceall_seurat@reductions$umap@cell.embeddings, sceall_seurat@meta.data))) + geom_point(aes(x=UMAP_1, y=UMAP_2, color=clusterIdents, text=paste("cell:", cellID, "\nrun:", run)), size=.9) + scale_color_manual(values =cols_knn_clusters) +ggplotly(g) +``` + + +```{r function to summarize in which clusters a marker is found to be significant} +# for each gene, extract the number of clusters in which it is detected as a marker + +summarizeMarkerSpecificity <- function(markers.out, fdr_cutoff) { + + markers_sel <- lapply(markers.out, + function(x, fdr_cutoff) + subset(x, FDR < fdr_cutoff), fdr_cutoff=fdr_cutoff) + allmarkergenes <- unique(unlist(lapply(markers_sel, rownames))) + + ismarker <- mapply(x=markers_sel, + clus=names(markers_sel), + function(x,clus) { + xx <- x[allmarkergenes,c('FDR'),drop=F] + #colnames(xx) <- paste0(clus,'FDR') + rownames(xx) <- allmarkergenes + return(xx) + }) + + ismarker <- do.call(cbind, ismarker) + ismarker_boolean <- as.matrix(ismarker) < fdr_cutoff + ismarker_boolean[is.na(ismarker_boolean)] <- FALSE + ismarker_boolean <- as.data.table(ismarker_boolean) + ismarker_boolean[, nbClusters := rowSums(ismarker_boolean)] + ismarker_boolean[, gene := rownames(ismarker)] + ismarker_boolean <- ismarker_boolean[order(ismarker_boolean$nbClusters),] + ismarker_boolean <- ismarker_boolean[, c("gene", 'nbClusters',setdiff(colnames(ismarker_boolean), c('gene','nbClusters'))), with=FALSE] + ismarker_boolean[, whichClusters := paste0(sort(setdiff(colnames(ismarker_boolean), c('gene','nbClusters'))[which(unlist(.SD) == "TRUE")]), collapse=','), .SDcols = setdiff(colnames(ismarker_boolean), c('gene','nbClusters')), by = gene] + + ismarker_info <- cbind(ismarker_boolean[, c('gene','nbClusters', 'whichClusters')], ismarker[match(ismarker_boolean$gene, rownames(ismarker)),]) + + ismarker_info$whichClusters <- gsub('.FDR', '', ismarker_info$whichClusters) + + return(ismarker_info) + + +} + +``` + + + +### Cluster markers using pairwise $t$-tests and keeping only markers differential in at least half of the pairwise comparisons between clusters + +#### Using the dataset post-filtering + + +Markers of the clusters are obtained using the function `findMarkers` (which by default performs a t-test) and are written in the folder `/data/derived/FL/cluster_markers/`. +In this version, we keep genes that are found to be differential in at least half of the pairwise comparisons between clusters, i.e. the combined p-value across comparisons is chosen to be the "middle-most" p-value of all tests (cluster comparisons) performed for a given gene. This is achieved using the option `pval.type="some"` in the `findMarkers` function. +Results are available in the file `r paste0(dircluster,'FL_markersDiffInHalfComparisons_',filelabel,'.xlsx')`. + + +```{r find half specific markers with findMarkers} + +fdr_threshold <- 0.01 + +# write markers without using run as a blocking variable +markers.out <- findMarkers(sceall, sceall[[clusteringchoice]], #block=sceall$groupForDoubletDetection,#snn_scran_k15 + direction="up", assay.type="logcounts", pval.type = 'some') + +names(markers.out) <-clus2ident[as.character(names(markers.out))] + +markers.out <- lapply(markers.out, function(x) { + colnames(x) <- bayesbio:::mgsub(pattern=paste0('logFC.',1:length(unique(sceall_seurat[[clusteringchoice]][,1]))), + replacement = paste0('logFC.', clus2ident), + colnames(x)); + x}) + +markers_spe <- summarizeMarkerSpecificity(markers.out, fdr_cutoff = fdr_threshold) +markers_spe <- cbind(markers_spe, geneAnnot[match(markers_spe$gene,geneAnnot$mgi_symbol),]) + +c(list(summary=as.data.frame(markers_spe)),lapply(markers.out, function(x) as.data.frame(x[x$FDR < fdr_threshold,]) %>% cbind(gene=rownames(.), geneAnnot[match(rownames(.),geneAnnot$mgi_symbol),], .))) %>% + openxlsx::write.xlsx(., file = paste0(dircluster,'FL_markersDiffInHalfComparisons_',filelabel,'.xlsx'), asTable = TRUE, row.names = TRUE) + +``` + + + +```{r upset plots, fig.width=8, fig.height=6} +# look at intersection between genes associated with the clusters using an Upset plot + +forupset <- do.call(cbind,lapply(markers.out, function(x) as.integer(rownames(sceall) %in% rownames(x)[x$FDR < fdr_threshold & !is.na(x$FDR)]))) +rownames(forupset) <- rownames(sceall) +# order clusters +forupset <- forupset[,levels(sceall$clusterIdents)] +UpSetR::upset(data=data.frame(forupset), + line.size = 0.3, point.size = 2, + sets = levels(sceall$clusterIdents), + keep.order = TRUE, + order.by = c('freq'), + sets.x.label = '#markers', + number.angles = 0) # + + + +``` + +```{r count half-spe markers diff in all comparisons, results='asis'} +data.frame(nbDiffInHalfComparisons=sapply(markers.out, function(x) sum(x$FDR < fdr_threshold))) %>% knitr::kable() %>% print() +``` + + +The markers in the top 5 for each cluster are shown in the heatmap plot. + +```{r heatmap plot half specific markers, fig.width=12, fig.height=8} +sceall_seurat <- ScaleData(sceall_seurat) + +DoHeatmap(sceall_seurat, group.by = 'knn_clusters', features = unique(unlist(lapply(markers.out, function(x) rownames(x[x$FDR < .01,])[1:5]))), group.colors = cols_knn_clusters2) + ggtitle(plottitle) + + theme(text = element_text(size = 16)) + + scale_fill_gradientn(colors = colorRampPalette(c("#033080", "#ffffe3", "#edea13"))(256)) + +``` + + +UMAP plot of the top 5 markers for each cluster is shown below. + +```{r feature plot top middle specific markers, fig.width=18, fig.height=20} +FeaturePlot(sceall_seurat, features = unique(unlist(lapply(markers.out, function(x) rownames(x[x$FDR < .01,])[1:5]))), ncol = 6, pt.size=0.5) + +``` + + + +#### Enrichment analysis of the cluster markers + +Using the genes differential between each cluster and at least half of the other clusters (with FDR < 1%), we perform an enrichment/over-representation analysis using the MSigDB (http://software.broadinstitute.org/gsea/msigdb) and the GeneSetDB databases (https://www.genesetdb.auckland.ac.nz/haeremai.html). The number of top genes used can be adapted depending on the obtained results. + +The top 10 enriched gene sets of a subset of collections of gene sets are shown below. +The complete results are stored in the folders`data/derived/FL/cluster_markers/ORA_pathways_MsigDB/` + + +```{r pathway analysis} + +library(EGSEA) +library(openxlsx) + + +# using MSigDB +resora <- foreach(res = markers.out, + clusterind = names(markers.out)) %do% { + + + #geneids <- rownames(res[res$Top %in% 1:150 & res$FDR < .001,]) + geneids <- rownames(res[res$FDR < .01,]) + + if (length(geneids) > 0) { + + logfc <- structure(#.Data = rowMeans(as.matrix(res[geneids,-c(1:3)]), na.rm = TRUE), + .Data = apply(as.matrix(res[geneids,-c(1:3)]), 1, function(x) x[which.max(abs(x))]), + .Names = geneids) + + entrezids <- as.character(geneAnnot[match(tolower(geneids), tolower(geneAnnot$mgi_symbol)),]$entrezgene_id ) + logfc <- logfc[!is.na(entrezids)] + names(logfc) <- entrezids[!is.na(entrezids)] + + genes <- geneAnnot[match(tolower(rownames(res)),tolower(geneAnnot$mgi_symbol)), c('entrezgene_id', 'mgi_symbol')] + colnames(genes) <- c('FeatureID', 'Symbols') + genes <- na.omit(genes) + + gs.annots_ora = buildIdx(entrezIDs = names(logfc), species = "mouse", gsdb.gsets = "all", kegg.exclude = TRUE) + ora = egsea.ora(geneIDs=names(logfc), + universe= na.omit(geneAnnot$entrezgene_id), + logFC =as.matrix(logfc), + title=clusterind, + gs.annots=gs.annots_ora, + symbolsMap=genes[match(names(logfc), genes$FeatureID),], + num.threads = 2, + report = F, + report.dir = '') + } + else { + ora <- NA + } + return(ora) + + } + +names(resora) = names(markers.out) + +# write in excel tables +dir.create(paste0(dircluster,'ORA_pathways_MsigDB/')) +foreach( comp = names(resora)) %do% { + getres <- mapply(res = resora[[comp]]@results, + degenes = resora[[comp]]@gs.annots, + function(res, degenes) { + res$test.results[[1]] <- subset(res$test.results[[1]], p.adj < .25) + if(nrow(res$test.results[[1]]) > 100) res$test.results[[1]] <- res$test.results[[1]][1:100,] + cbind(res$test.results[[1]], + degenes@anno[match(rownames(res$test.results[[1]]), rownames(degenes@anno)),], + genes=sapply(degenes@idx[match(rownames(res$test.results[[1]]), names(degenes@idx))], + function(x, degenes) paste(geneAnnot$mgi_symbol[match(degenes@featureIDs[x],geneAnnot$entrezgene_id)], collapse=',') + , degenes = degenes)) + }, SIMPLIFY = FALSE) + + + #unlist(lapply(getres, function(x) sum(x$p.adj < .1))) + openxlsx::write.xlsx(getres, file = paste0(dircluster,"ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',comp),".xlsx"), row.names=TRUE) +} + + +``` + + +```{r pathway ORA top 10 for half specific markers, results='asis'} + +topora <- foreach (comp = names(markers.out)) %do% { + + reg <- readxl::read_xlsx(paste0(dircluster,"ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',comp),".xlsx"), + sheet = 'gsdbreg') + go <- readxl::read_xlsx(paste0(dircluster,"ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',comp),".xlsx"), + sheet = 'gsdbgo') + path <- readxl::read_xlsx(paste0(dircluster,"ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',comp),".xlsx"), + sheet = 'gsdbpath') + + reg <- subset(reg, p.adj < .05) + go <- subset(go, p.adj < .05) + path <- subset(path, p.adj < .05) + + cat(paste0('\n\n##### ',comp, ' \n\n') ) + #%>% knitr::kable() %>% print() + reg[1:min(10,nrow(reg)),c("GeneSet", "p.adj", "NumGenes", "genes")] %>% knitr::kable() %>% print() + go[1:min(10,nrow(go)),c("GeneSet", "p.adj", "NumGenes", "genes")] %>% knitr::kable() %>% print() + path[1:min(10,nrow(path)),c("GeneSet", "p.adj", "NumGenes", "genes")] %>% knitr::kable() %>% print() +} +``` + + +```{r plot ORA results 1, fig.width=16, fig.height=16} +# plot the results of over-enrichment analysis +# returns a list containing a ggplot object and the table used to make the plot +plotORA<- function(cluster, # cluster label + resora, # table containing results of the ORA analysis + padj_threshold, + fileplot=NULL, + drawplot=FALSE, # should the plot be drawn for each cluster + facet_ncol=1, maxGeneSetsShown = 10, + selcat=c("c2", "c3", "c4", "c5", "c6", "gsdbdrug", "gsdbpath", "gsdbdis", "gsdbgo", "kegg"), ...) { + + getres <- lapply(resora[selcat], + function(res) { + if (nrow(res) >0 ) { + res <- subset(res, p.adj< padj_threshold) + if (nrow(res) > maxGeneSetsShown) + res <- res[1:maxGeneSetsShown,] + return(res) + } + }) + + # remove categories with no results + ind <- which(sapply(getres, function(x) if (!is.null(x)) nrow(x) else 0)>0) + + if (length(ind) > 0) getres <- getres[ind, drop=FALSE] else next + + # add category to results before merging + getres <- mapply(gr=getres,ngr=names(getres), function(gr,ngr) {gr$category <- ngr; gr}, SIMPLIFY = FALSE) + cn <- sapply(getres, function(x) {colnames(x)}) + cn <- setdiff(names(which(table(unlist(cn)) == max(table(unlist(cn))))), "") + cn <- c('GeneSet', setdiff(cn, "GeneSet")) + # aggregate results into one table + getresmergesel <- do.call(rbind, lapply(getres, function(x) x[,cn, drop=FALSE])) + getresmergesel$cluster <- cluster + + getresmergesel %>% .[order(.$category,.$p.adj, decreasing = TRUE),] + getresmergesel$GeneSet <- toupper(getresmergesel$GeneSet) + getresmergesel$GeneSet <- factor(getresmergesel$GeneSet, levels = unique(getresmergesel$GeneSet)) + getresmergesel$RatioGenes <- sapply(strsplit(getresmergesel$NumGenes, split='/'), function(x) as.numeric(x[1])/as.numeric(x[2])) + getresmergesel$NbGenes <- sapply(strsplit(getresmergesel$NumGenes, split='/'), function(x) as.numeric(x[1])) + + if (nrow(getresmergesel) > 0) { + gg <- ggplot(getresmergesel, + aes(y=-log10(p.adj), x=GeneSet, fill=RatioGenes)) + + ggforce::facet_col(~category, scales = 'free_y', space='free') + + #facet_wrap(~category, scales = 'free')+ + coord_flip() + + xlab("") + ylab("-log10(adj.pval)") + ggtitle(cluster)+ + theme_bw()+ + geom_bar(stat='identity', position="dodge") + coord_flip() + + if (drawplot) { + if (!is.null(fileplot)) pdf(fileplot, ...) + print(gg) + if (!is.null(fileplot)) dev.off() + } + + return(list(plot=gg, dataplot=getresmergesel)) + } +} + + +readAllORA<- function(resora_files, + resora_names, + padj_threshold, + maxGeneSetsShown = NULL) { + + # for each cluster, read all gene set results + allresora <- + mapply(resora_file = resora_files, + resora_name = resora_names, + function(resora_file, resora_name) { + genesets <- openxlsx::getSheetNames(resora_file) + getres <- sapply(genesets, function(gs) openxlsx::read.xlsx(resora_file, sheet = gs), simplify = FALSE) + + # remove categories with no results + ind <- which(sapply(getres, function(x) if (!is.null(x)) nrow(x) else 0)>0) + if (length(ind) > 0) getres <- getres[ind, drop=FALSE] else return(NULL) + + # add category to results before merging + getres <- mapply(gr=getres,ngr=names(getres), + function(gr,ngr) { + gr$category <- ngr; + gr$rank <- order(gr$p.value) + gr + }, SIMPLIFY = FALSE) + cn <- sapply(getres, function(x) {colnames(x)}) + cn <- setdiff(names(which(table(unlist(cn)) == max(table(unlist(cn))))), "") + cn <- c('GeneSet', setdiff(cn, "GeneSet")) + # aggregate results into one table + getresmergesel <- do.call(rbind, lapply(getres, function(x) x[,cn, drop=FALSE])) + getresmergesel$cluster <- resora_name + + getresmergesel %>% .[order(.$category,.$p.adj, decreasing = TRUE),] #"gsdbgo","gsdbpath", "c2" + getresmergesel$GeneSet <- toupper(getresmergesel$GeneSet) + getresmergesel$GeneSet <- factor(getresmergesel$GeneSet, levels = unique(getresmergesel$GeneSet)) + getresmergesel$RatioGenes <- sapply(strsplit(getresmergesel$NumGenes, split='/'), function(x) as.numeric(x[1])/as.numeric(x[2])) + getresmergesel$NbGenes <- sapply(strsplit(getresmergesel$NumGenes, split='/'), function(x) as.numeric(x[1])) + + return(getresmergesel) + + }, SIMPLIFY = FALSE) + + # merge results from all clusters in one data.frame + allresora <- do.call(rbind, allresora) + + # select gene sets significant in at least one cluster + if (is.null(maxGeneSetsShown)) { + signgs <- unique(allresora[allresora$p.adj < padj_threshold,]$GeneSet) + } else { + signgs <- unique(allresora[allresora$p.adj < padj_threshold & allresora$rank %in% 1:maxGeneSetsShown,]$GeneSet) + + } + + + + + allresora <- allresora[allresora$GeneSet %in% signgs,] + + return(allresora) + +} + +padj_threshold <- 0.05 +maxGeneSetsShown <- 15 + + +``` + +* The plots of the top `r maxGeneSetsShown` or all gene sets with adjusted p-value lower than `r padj_threshold` are saved in in `r paste0(dircluster,"/ORA_pathways_MsigDB/plots/")`. + + + +```{r plot ORA results 2, fig.width=8, fig.height=12} + +dir.create(paste0(dircluster,"ORA_pathways_MsigDB/plots/")) + +padj_threshold <- 0.05 +maxGeneSetsShown <- 15 + +# this will build one plot per cluster for all selected categories of gene sets +plotperclus <- foreach (comp = levels(Idents(sceall_seurat))) %do% { + + genesets <- openxlsx::getSheetNames(paste0(dircluster,"ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',comp),".xlsx")) + + resora <- sapply(genesets, function(gs) openxlsx::read.xlsx(paste0(dircluster,"ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',comp),".xlsx"), sheet = gs)) + +gora <- plotORA(cluster = comp, + resora = resora, + padj_threshold = padj_threshold, + fileplot = NULL, + drawplot= FALSE,#paste0("plots/voomWithQualityWeight_withflowcell_ORA_",region,'.pdf'), + facet_ncol = 2, + maxGeneSetsShown = maxGeneSetsShown, + selcat=c("gsdbpath", "gsdbreg", "gsdbgo", "kegg"), + width=20, height=15) +#selcat = c("gsdbpath", "c5", "kegg"), +return(gora) +} + +# plot them all individually, grouped by cluster +#do.call(gridExtra::grid.arrange, c(lapply(plotperclus, function(x) x$plot), nrow=4)) + +# ==== Plot by category ==== +# we merge the results tables for each cluster into one big table such that we can plot all clusters on a same plot +intcol <- names(which(table(unlist(sapply(plotperclus, function(x) colnames(x$dataplot)))) == length(plotperclus))) +alloraressel <- do.call(rbind, lapply(plotperclus, function(x) x$dataplot[,intcol])) + +alloraressel$cluster <- factor(alloraressel$cluster, levels = levels(Idents(sceall_seurat))) + +alloraressel$GeneSet <- as.character(alloraressel$GeneSet) + +foreach (catsel=c("gsdbpath", "gsdbreg", "gsdbgo", "kegg"), + width = c(12, 6, 12, 8), + height = c(23, 10, 23, 20) + ) %do% { +catplot <- + alloraressel[alloraressel$category == catsel,] %>% + group_by(cluster, category) %>% + arrange(desc(-1/RatioGenes)) %>% + ungroup() %>% + mutate( GeneSet = factor(paste(GeneSet, cluster, sep='__'), + levels = rev(paste(GeneSet, cluster, sep='__')))) %>% + ggplot(aes(y=RatioGenes, x=GeneSet, + fill=-log10(p.adj), + )) + + theme_bw() + + geom_bar(stat='identity', position="dodge") + + ggforce::facet_col(~cluster, scales = 'free_y', space = 'free') + + scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) + + coord_flip() + ylab("% of genes from gene set that are DE") + + ggtitle(label = catsel, subtitle = paste0(" adjusted p-value < ", padj_threshold, ", top ", maxGeneSetsShown)) + + if (catsel == 'gsdbreg') print(catplot) + +ggsave(plot = catplot, + filename = paste0(dircluster,"ORA_pathways_MsigDB/plots/", catsel, "_orderByGeneRatio_padj", padj_threshold,"_", "top", maxGeneSetsShown, ".pdf"), + width = width, height = height, dpi = 300) + + +} +``` + + +```{r plot ORA results as circles, fig.width=9, fig.height=6} + +dir.create(paste0(dircluster,"ORA_pathways_MsigDB/plots/")) + + + +# ==== Plot showing all clusters at once ==== + +#padj_threshold <- 0.05 +maxGeneSetsShown <- 15 +# maxGeneSetsShown <- 10 +cat2plot <- c("gsdbpath", "gsdbreg", "gsdbgo", "kegg") + +## === showing top x (maxGeneSetsShown) only === +resallora <- readAllORA(resora_files = paste0(dircluster,"/ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',names(markers.out)),".xlsx"), + resora_names = names(markers.out), + padj_threshold = padj_threshold, + maxGeneSetsShown = maxGeneSetsShown) + +resallora$cluster <- factor(resallora$cluster, levels = levels(Idents(sceall_seurat))) + +foreach (catsel=cat2plot) %do% { + catplot <- + resallora %>% + filter (category == catsel & p.adj < padj_threshold) %>% + group_by(cluster) %>% + arrange(cluster, p.value) %>% + ungroup() %>% + mutate( GeneSet = factor(GeneSet, + levels = rev(unique(GeneSet)))) %>% + ggplot(aes(y=cluster, x=GeneSet)) + + theme_bw() + + geom_point(aes(size=RatioGenes, colour = -log10(p.adj))) + + scale_x_discrete(labels = function(x) substr(gsub("__.+$", "", x), start = 1, stop = 80)) + + coord_flip() + + ggtitle(label = catsel, subtitle = paste0(" adjusted p-value < ", padj_threshold, ", top ", maxGeneSetsShown)) + ylab('') + + ggsave(plot = catplot, + filename = paste0(dircluster,"/ORA_pathways_MsigDB/plots/", catsel, "_circles_padj", padj_threshold,"_", "top", maxGeneSetsShown, '.pdf'), + width = if (catsel %in% c('gsdbgo','gsdbpath')) 15 else 10, height = if (catsel %in% c('gsdbgo','gsdbpath')) 13 else 7, dpi = 300) + + + +} + + +## === showing all below a certain adjusted p-value threshold === +resallora <- readAllORA(resora_files = paste0(dircluster,"/ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',names(markers.out)),".xlsx"), + resora_names = names(markers.out), + padj_threshold = padj_threshold, + maxGeneSetsShown = NULL) + +resallora$cluster <- factor(resallora$cluster, levels = levels(Idents(sceall_seurat))) + +foreach (catsel=cat2plot) %do% { + catplot <- + resallora %>% + filter (category == catsel & p.adj < padj_threshold) %>% + group_by(cluster) %>% + arrange(cluster, p.value) %>% + ungroup() %>% + mutate( GeneSet = factor(GeneSet, + levels = rev(unique(GeneSet)))) %>% + ggplot(aes(y=cluster, x=GeneSet)) + + theme_bw() + + geom_point(aes(size=RatioGenes, colour = -log10(p.adj))) + + scale_x_discrete(labels = function(x) substr(gsub("__.+$", "", x), start = 1, stop = 80)) + + coord_flip() + + ggtitle(label = catsel, subtitle = paste0(" adjusted p-value < ", padj_threshold)) + ylab('') + + ggsave(plot = catplot, + filename = paste0(dircluster,"/ORA_pathways_MsigDB/plots/", catsel, "_circles_padj", padj_threshold,"_all.pdf"), + width = if (catsel %in% c('gsdbgo','gsdbpath')) 20 else 10, height = if (catsel %in% c('gsdbgo','gsdbpath')) 20 else 6, dpi = 300) + + + +} + + + + +``` + + + +# Cell cycle quantification + +Using the `cyclone` function of the `scran` package. + + +```{r cyclone on snn random clusters, fig.width=8, fig.height=8} + +library(org.Mm.eg.db) +ensembl <- mapIds(org.Mm.eg.db, keys=rownames(sceall_seurat), keytype="SYMBOL", column="ENSEMBL") + +mm.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", + package="scran")) + +assignments <- cyclone(as.SingleCellExperiment(sceall_seurat), mm.pairs, gene.names=ensembl, assay.type="logcounts") + + +# +sceall_seuratbis <- sceall_seurat +sceall_seuratbis$G1 <- assignments$score$G1 +sceall_seuratbis$G2M <- assignments$score$G2M +sceall_seuratbis$S <- assignments$score$S +sceall_seuratbis$cyclone_cellcycle <- assignments$phases + + +``` + +```{r vln plots cyclone by cluster, fig.width=8, fig.height=8} + +vs1 <- VlnPlot(object = sceall_seuratbis , features = 'G1', group.by = 'knn_clusters', cols=cols_knn_clusters)+ggtitle('G1')+xlab('')+ylab('G1 Phase Score')+ geom_boxplot(outlier.shape = NA)+theme(legend.position="none") +vs3 <- VlnPlot(object = sceall_seuratbis , features = 'G2M', group.by = 'knn_clusters', cols=cols_knn_clusters)+ggtitle('G2M')+xlab('')+ylab('G2M Phase Score')+ geom_boxplot(outlier.shape = NA)+theme(legend.position="none") +vs2 <- VlnPlot(object = sceall_seuratbis , features = 'S', group.by = 'knn_clusters', cols=cols_knn_clusters)+ggtitle('S')+xlab('')+ylab('S Phase Score')+ geom_boxplot(outlier.shape = NA)+theme(legend.position="none") + print(gridExtra::marrangeGrob(list(vs1,vs2,vs3),ncol = 1, nrow=3, top='cyclone cell cycle Phase score vs. clusters for k=12')) + + +``` + + +```{r color umap by cyclone cell cycle stage, fig.width=7, fig.height=6} + +cols_cyclone <- c('G1' = "#FB8072", 'G2M' = "#B3DE69", 'S' = "#FDD662") + +d12umap_cyclone <- DimPlot(object = sceall_seuratbis , reduction = "umap", group.by = 'cyclone_cellcycle', cols=cols_cyclone, pt.size = 1, dims = c(1,2)) + + ggtitle('') #Colored by Cell cycle phase + + +print(gridExtra::marrangeGrob(list(d12umap_cyclone),ncol = 1, nrow=1, #,d12tsne_cyclone + top='Assignment of cell cycle Phase made by cyclone')) + + +sceall_seurat <- sceall_seuratbis + +``` + + + +# Proportion of Condition, Cell Cycle Phase, and Progenitor gates among Clusters + + +```{r proportion cell cycle phase among clusters, echo=F, warning=F, message=F, results=T, fig.height=2, fig.width=5} + +#cyckone cell cycle phase among clusters +scale(table(sceall_seurat$cyclone_cellcycle, sceall_seurat$knn_clusters), scale=colSums(table(sceall_seurat$cyclone_cellcycle, sceall_seurat$knn_clusters)), center = FALSE) %>% reshape2::melt() %>% mutate(.,cellcycle=Var1) %>% mutate(cluster=Var2) %>% ggplot(., aes(x=cluster, y =value, fill=cellcycle)) + geom_bar(stat = 'identity') + scale_fill_manual(values = cols_cyclone) + + +scale(table(sceall_seurat$condition, sceall_seurat$knn_clusters), scale=colSums(table(sceall_seurat$cyclone_cellcycle, sceall_seurat$knn_clusters)), center = FALSE) %>% reshape2::melt() %>% mutate(.,condition=Var1) %>% mutate(cluster=Var2) %>% ggplot(., aes(x=cluster, y =value, fill=condition)) + geom_bar(stat = 'identity') + scale_fill_manual(values = c('E10.5_Fetal_Liver'="#e6dc85", 'E12.5_Fetal_Liver'="#97d2db")) + + +scale(table(sceall_seurat$condition, sceall_seurat$knn_clusters), scale=colSums(table(sceall_seurat$cyclone_cellcycle, sceall_seurat$knn_clusters)), center = FALSE) %>% reshape2::melt() %>% mutate(.,condition=Var1) %>% mutate(cluster=Var2) %>% ggplot(., aes(x=cluster, y =value, fill=condition)) + geom_bar(stat = 'identity') + scale_fill_manual(values = c('E10.5_Fetal_Liver'="#e6dc85", 'E12.5_Fetal_Liver'="#97d2db")) + +scale(table(sceall_seurat$knn_clusters, sceall_seurat$progenitors), scale=colSums(table(sceall_seurat$knn_clusters, sceall_seurat$progenitors)), center = FALSE) %>% reshape2::melt() %>% mutate(cluster=Var1) %>% mutate(progenitor=Var2) %>% ggplot(., aes(x=progenitor, y =value, fill=cluster)) + geom_bar(stat = 'identity') + scale_fill_manual(values = cols_knn_clusters) + + +``` + +## Gata1 and Sfpi1 expression + + +Cells are ordered with respect to Gata1 expression. + +```{r, fig.width=5, fig.height=4} +# Cells to use +E10.5FL_cells <-sceall_seurat@meta.data %>% tibble::rownames_to_column('cell') %>% + filter(condition == "E10.5_Fetal_Liver") %>% dplyr::select('cell') %>% unlist + +E12.5FL_cells <-sceall_seurat@meta.data %>% tibble::rownames_to_column('cell') %>% + filter(condition == "E12.5_Fetal_Liver") %>% dplyr::select('cell') %>% unlist + +barplot(sceall_seurat@assays$RNA@data['Gata1',][order(sceall_seurat@assays$RNA@data['Gata1',])], col = alpha('turquoise',.5), border = NA, names.arg = FALSE) +barplot(sceall_seurat@assays$RNA@data['Sfpi1',][order(sceall_seurat@assays$RNA@data['Gata1',])], col = alpha('blue',.7), border = NA, names.arg = FALSE, add=TRUE) +legend('top',legend=c('Gata1','Spfi1'), pch=16, col=c('turquoise', 'blue')) + + +``` + +### Number of cells expressing both, neither, or only one of Gata1 and Sfpi1 + +```{r} +thresholdExpr <- 1 +``` + +```{r build expr categories, results=TRUE} + + bothExpr <- tibble(cellID=colnames(sceall_seurat), + Gata1=sceall_seurat@assays$RNA@data['Gata1',], + Sfpi1=sceall_seurat@assays$RNA@data['Sfpi1',]) %>% + filter(., Gata1 > thresholdExpr & Sfpi1 > thresholdExpr) + bothExpr <- bothExpr$cellID + + onlyGata1 <- tibble(cellID=colnames(sceall_seurat), + Gata1=sceall_seurat@assays$RNA@data['Gata1',], + Sfpi1=sceall_seurat@assays$RNA@data['Sfpi1',]) %>% + filter(., Gata1 > thresholdExpr & Sfpi1 <= thresholdExpr) + onlyGata1 <- onlyGata1$cellID + + onlySfpi1 <- tibble(cellID=colnames(sceall_seurat), + Gata1=sceall_seurat@assays$RNA@data['Gata1',], + Sfpi1=sceall_seurat@assays$RNA@data['Sfpi1',]) %>% + filter(., Gata1 <= thresholdExpr & Sfpi1 > thresholdExpr) + onlySfpi1 <- onlySfpi1$cellID + + neither <- tibble(cellID=colnames(sceall_seurat), + Gata1=sceall_seurat@assays$RNA@data['Gata1',], + Sfpi1=sceall_seurat@assays$RNA@data['Sfpi1',]) %>% + filter(., Gata1 <= thresholdExpr & Sfpi1 <= thresholdExpr) + neither <- neither$cellID + + tmp <- sceall_seurat + tmp$expr_Sfpi1_Gata1 <- NA + tmp@meta.data[bothExpr,]$expr_Sfpi1_Gata1 <- 'both' + tmp@meta.data[neither,]$expr_Sfpi1_Gata1 <- 'neither' + tmp@meta.data[onlyGata1,]$expr_Sfpi1_Gata1 <- 'only_Gata1' + tmp@meta.data[onlySfpi1,]$expr_Sfpi1_Gata1 <- 'only_Sfpi1' + + + table(tmp$expr_Sfpi1_Gata1, tmp$condition) %>% knitr::kable() + table(tmp$expr_Sfpi1_Gata1, tmp$clusterIdents) %>% knitr::kable() + +``` + + + +### Percent of cells *by cluster* across categories of expression + +```{r perc cells by cluster across cat, results=TRUE, fig.width=4.5, fig.height=2} +signif(scale((table(tmp$expr_Sfpi1_Gata1, tmp$clusterIdents)), + scale=as.vector(table(tmp$clusterIdents)), center=FALSE)*100, 3) %>% knitr::kable() + + scale((table(tmp$expr_Sfpi1_Gata1, tmp$clusterIdents)), + scale=as.vector(table(tmp$clusterIdents)), center=FALSE) %>% + reshape2::melt() %>% + mutate(.,expr_cat=Var1) %>% + mutate(cluster=Var2) %>% + ggplot(., aes(x=cluster, y =value, fill=expr_cat)) + + geom_bar(stat = 'identity') #+ + # scale_fill_manual(values = cols_cyclone) + + +``` + +# FACS Plots + + + +```{r} +VlnPlot(object = subset(sceall_seurat, idents = c("EMP", "MP", "GP", "Mac", "Mast")) , features = 'CD16_32_MFI', cols = c("lightgrey", "lightgrey", "lightgrey", "lightgrey", "lightgrey")) +ylab('MFI')+ geom_boxplot(outlier.shape = NA) +theme(legend.position="none") + +``` + +```{r} +FeaturePlot(object = sceall_seurat , reduction = "umap", features = "CD16_32_MFI", pt.size = 1, min.cutoff="q10", max.cutoff="q95", dims = c(1,2), label = FALSE, cols = c("lightgrey", "royalblue4")) + +``` + + + +# Data Storage + +The data with clustering results are saved in `r filesce`. The colors are saved in `r paste0(dirdata,'derived/FL/FL_cols.rds')`. + +```{r} +sceall_seurat <- sceall_seuratbis +saveRDS(sceall_seurat, file = filesce) +saveRDS(cols, file=paste0(dirdata,'derived/FL/FL_cols.rds')) + +``` + + + +