diff --git a/src/FL/FL_03.clustering.Rmd b/src/FL/FL_03.clustering.Rmd
new file mode 100644
index 0000000000000000000000000000000000000000..74eaec6c8a47db57260c712b69facf55587748db
--- /dev/null
+++ b/src/FL/FL_03.clustering.Rmd
@@ -0,0 +1,1009 @@
+---
+title: "Cluster E10.5 and E12.5 Fetal Liver samples"
+author: "Anne Biton"
+date: "`r format(Sys.time(), '%d %B, %Y')`"
+
+output:
+  prettydoc::html_pretty:
+    highlight: github
+    number_sections: yes
+    theme: cayman
+    toc: yes
+---
+
+
+
+
+```{r setup, eval=TRUE, echo=FALSE, warning = FALSE, message=FALSE, results=FALSE, prompt=FALSE}
+knitr::opts_chunk$set(echo = FALSE, cache = TRUE, warning = FALSE, message=FALSE, results=FALSE, prompt=FALSE, dpi=150)
+
+library(renv)
+library(data.table)
+library(ggplot2)
+library(foreach)
+library(biomaRt)
+library(BiocParallel)
+library(clusterExperiment)
+library(matrixStats)
+library(Seurat)
+library(Matrix)
+library(scater)
+library(scran)
+library(dplyr)
+library(scales)
+library(tidyverse) 
+library(SingleCellExperiment) 
+library(igraph) 
+library(randomcoloR)
+library(batchelor)
+library(openxlsx)
+library(pheatmap)
+
+dirdata <- "../../data/"
+```
+
+
+# Load data
+
+##  Merging first by condition then across conditions
+
+Each condition was merged across runs. Then all conditions were merged in order of E12.5 FL > E10.5 FL. 
+
+
+
+# UMAP plots of the merged dataset
+
+Merging is done using the 3,000 variable features selected above.
+See script `FL_02.merge.Rmd`
+Each condition was merged across runs. Then all conditions were merged in order of E12.5 FL > E10.5 FL. Data available in `data/derived/FL/FL_sceall_seurat_fastMNN_bycond.rds`.
+
+
+
+```{r}
+filesce <- paste0(dirdata,'derived/FL/FL_sceall_seurat_fastMNN_bycond.rds')
+sceall_seurat <- readRDS(filesce)
+```
+
+```{r load FL run123 progenitors and colors}
+
+# load progenitors from run 123 FL (CMP/GMP/MEP/P1)
+cell2progenitor <- readRDS(paste0(dirdata,'derived/FL/FL_cell2progenitor_run123.rds'))
+sceall_seurat$progenitors <- cell2progenitor[colnames(sceall_seurat)]
+
+
+cols_progenitors <- c('CMP' = "#d570fa", 'GMP' = "#4e9dfc", 'MEP' = "#ed5a32", 'P1' = "#eda532", 'NA' = "#ffffff")
+cols_progenitors <- structure(.Data=alpha(cols_progenitors, .6), .Names=names(cols_progenitors))
+
+
+```
+
+
+```{r}
+mart <- useMart(biomart="ensembl", dataset="mmusculus_gene_ensembl")
+geneAnnot <- biomaRt::getBM(values = rownames(sceall_seurat@assays$RNA_raw@data), 
+                            attributes = c('mgi_symbol', 'mgi_id', 'entrezgene_id','description', 'chromosome_name', 
+                                           'start_position', 'end_position', 'strand', 'gene_biotype'), 
+                            filters = 'mgi_symbol', mart=mart)
+geneAnnot$mgi_id <- paste0('http://www.informatics.jax.org/marker/',geneAnnot$mgi_id)
+# add missing
+geneAnnot <- rbind(geneAnnot,  data.frame( mgi_symbol=setdiff(rownames(sceall_seurat@assays$RNA_raw@data), geneAnnot$mgi_symbol),
+                                           mgi_id=NA, entrezgene_id=NA, description=NA, chromosome_name=NA, start_position=NA, 
+                                           end_position=NA, strand=NA, gene_biotype=NA))
+
+```
+
+
+```{r}
+sceall_seurat <- RunTSNE(object = sceall_seurat, dims = 1:50, reduction = 'fastMNN', verbose=FALSE)
+sceall_seurat <- RunUMAP(object = sceall_seurat, dims = 1:50, reduction = 'fastMNN', verbose=FALSE)
+```
+
+
+
+```{r, fig.width=10, fig.height=7, dpi=300}
+
+#'E9.5_Yolk_Sac' = "lightpink", 'E10.5_Yolk_Sac' = "goldenrod1", 
+cols_condition <- c("#e6dc85", "#97d2db")
+cols_condition <- structure(.Data=alpha(cols_condition, .3), 
+                            .Names=c('E10.5_Fetal_Liver', 'E12.5_Fetal_Liver'))
+
+
+DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'run', pt.size = 2, dims = c(1,2), label = FALSE)
+DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'plate', pt.size = 2, dims = c(1,2), label = FALSE)
+DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'condition', cols = cols_condition, pt.size = 3, dims = c(1,2), label = FALSE)
+FeaturePlot(object = sceall_seurat , reduction = "umap", features =  'nFeature_RNA', pt.size = 1, dims = c(1,2), label = FALSE)
+FeaturePlot(object = sceall_seurat , reduction = "umap", features =  'percent.mito', pt.size = 2, dims = c(1,2), label = FALSE)
+ 
+cols_clustersrun123 <- c('late_Ery' = "#D9D9D9", 'Mast' = "#FDB462", 'EMP' = "#B3DE69", 'MacP' = "#BEBADA", 'Mk_I' = "#BC80BD", 'mid_Ery' = "#FFFFB3", 'Mk_II' = "#8DD3C7", 'early_Ery' = "#FCCDE5", 'GMP' = "#80B1D3", 'Mk_III' = "#8A8A8A", 'pMac' = "#FB8072")
+cols_clustersrun123 <- structure(.Data=alpha(cols_clustersrun123, .8), .Names=names(cols_clustersrun123))
+
+cols_progenitors <- c('CMP' = "#d570fa", 'GMP' = "#4e9dfc", 'MEP' = "#ed5a32", 'P1' = "#eda532", 'NA'="white")
+cols_progenitors <- structure(.Data=alpha(cols_progenitors, .4), .Names=names(cols_progenitors))
+
+
+#DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'cluster_libraryRun123',  cells = colnames(sceall_seurat)[which(sceall_seurat$cluster_libraryRun123 != 'NA')], cols = cols_clustersrun123, pt.size = 2, dims = c(1,2), label = FALSE) + ggtitle(paste0("Cluster Annotations from All Cells Run123bycond "))#,
+
+cols <-  c(cols_condition, cols_clustersrun123, cols_progenitors)
+
+
+# Cells to use
+E10.5FL_cells <-sceall_seurat@meta.data %>% tibble::rownames_to_column('cell') %>%
+  filter(condition == "E10.5_Fetal_Liver") %>% dplyr::select('cell') %>% unlist
+
+E12.5FL_cells <-sceall_seurat@meta.data %>% tibble::rownames_to_column('cell') %>%
+  filter(condition == "E12.5_Fetal_Liver") %>% dplyr::select('cell') %>% unlist
+
+DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'progenitors', cells = E10.5FL_cells, cols = cols_progenitors, pt.size = 2, dims = c(1,2), label = FALSE) + ggtitle(paste0("Progenitor Annotations from E10.5 FL Index Sort"))#,
+
+DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'progenitors',cells = E12.5FL_cells, cols = cols_progenitors, pt.size = 2, dims = c(1,2), label = FALSE) + ggtitle(paste0("Progenitor Annotations from E12.5 FL Index Sort"))#,
+
+DimPlot(object = sceall_seurat , reduction = "umap", group.by = 'progenitors', cols = cols_progenitors, pt.size = 2, dims = c(1,2), label = FALSE) + ggtitle(paste0("Progenitor Annotations from FL Index Sort"))#,
+
+
+#nFeature with spectral scale
+sceall_seurat[['log2_nCount_RNA']] <- log2(sceall_seurat[['nCount_RNA']])
+ggplot(data = data.frame(UMAP_1 = sceall_seurat@reductions$umap@cell.embeddings[,1],
+                                      UMAP_2 = sceall_seurat@reductions$umap@cell.embeddings[,2],
+                                      log2_nCount_RNA = sceall_seurat[['log2_nCount_RNA']])) + geom_point(aes(UMAP_1, UMAP_2, col = log2_nCount_RNA), size = 2) + theme_classic() + scale_color_gradientn(colors = rev(RColorBrewer::brewer.pal(10, "Spectral")))
+
+
+```
+
+
+
+
+
+# Clustering
+
+
+
+##  Clustering using SNN-graph and random walks using the `scran` and `igraph` packages
+
+A shared nearest-neighbor graph (SNN) is built using the `buildSNNGraph` function of the `scran` package.
+Then communities, which are densely connected subgraphs, are searched for in the SNN graph using the method of random walks implemented in the `cluster_walktrap` function of the `igraph` package.
+See Pascal Pons, Matthieu Latapy: Computing communities in large networks using random walks, http://arxiv.org/abs/physics/0512106.
+
+
+### Cluster modularity
+
+We look at the ratio of the observed and expected edge weights to visualize the modularity of the clusters. 
+
+
+
+
+```{r compute SNN clusters, fig.width=2.5, fig.height=2}
+#We can verify this by clustering on the corrected low-dimensional values using a graph-based method (Xu and Su 2015). 
+
+g10 <- buildSNNGraph(sceall_seurat@reductions$fastMNN@cell.embeddings, d=NA, transposed=TRUE, k = 10) 
+sceall_seurat$snn_scran_k10 <-  igraph::cluster_walktrap(g10, steps = 5)$membership
+g11 <- buildSNNGraph(sceall_seurat@reductions$fastMNN@cell.embeddings, d=NA, transposed=TRUE, k = 11) 
+sceall_seurat$snn_scran_k11 <-  igraph::cluster_walktrap(g11, steps = 5)$membership
+g12 <- buildSNNGraph(sceall_seurat@reductions$fastMNN@cell.embeddings, d=NA, transposed=TRUE, k = 12) 
+sceall_seurat$snn_scran_k12 <-  igraph::cluster_walktrap(g12, steps = 5)$membership
+g13 <- buildSNNGraph(sceall_seurat@reductions$fastMNN@cell.embeddings, d=NA, transposed=TRUE, k = 13) 
+sceall_seurat$snn_scran_k13 <-  igraph::cluster_walktrap(g13, steps = 5)$membership
+
+par(mfrow=c(3,2))
+cluster.mod <- clusterModularity(g10, sceall_seurat$snn_scran_k10, get.weights=TRUE)
+log.ratio <- log2(cluster.mod$observed/cluster.mod$expected + 1)
+p1 <- pheatmap(log.ratio, cluster_rows=FALSE, cluster_cols=FALSE, 
+    color=colorRampPalette(c("white", "blue"))(100), main = 'k = 10')
+cluster.mod <- clusterModularity(g11, sceall_seurat$snn_scran_k11, get.weights=TRUE)
+log.ratio <- log2(cluster.mod$observed/cluster.mod$expected + 1)
+p2 <- pheatmap(log.ratio, cluster_rows=FALSE, cluster_cols=FALSE, 
+    color=colorRampPalette(c("white", "blue"))(100), main = 'k = 11')
+cluster.mod <- clusterModularity(g12, sceall_seurat$snn_scran_k12, get.weights=TRUE)
+log.ratio <- log2(cluster.mod$observed/cluster.mod$expected + 1)
+p3 <- pheatmap(log.ratio, cluster_rows=FALSE, cluster_cols=FALSE, 
+    color=colorRampPalette(c("white", "blue"))(100), main = 'k = 12')
+cluster.mod <- clusterModularity(g13, sceall_seurat$snn_scran_k13, get.weights=TRUE)
+log.ratio <- log2(cluster.mod$observed/cluster.mod$expected + 1)
+p4 <- pheatmap(log.ratio, cluster_rows=FALSE, cluster_cols=FALSE, 
+    color=colorRampPalette(c("white", "blue"))(100), main = 'k = 13')
+
+
+```
+
+
+
+
+```{r plot snn clusters, fig.width=14, fig.height = 12}
+
+ggclusterk10 <- DimPlot(object = sceall_seurat , reduction = "umap", pt.size = 1, dims = c(1,2), group.by = 'snn_scran_k10', cols = 'Set3') + ggtitle('k=10')
+ggclusterk11 <- DimPlot(object = sceall_seurat , reduction = "umap", pt.size = 1, dims = c(1,2), group.by = 'snn_scran_k11', cols = 'Set3') + ggtitle('k=11')
+ggclusterk12 <- DimPlot(object = sceall_seurat , reduction = "umap", pt.size = 1, dims = c(1,2), group.by = 'snn_scran_k12', cols = 'Set3') + ggtitle('k=12')
+ggclusterk13 <- DimPlot(object = sceall_seurat , reduction = "umap", pt.size = 1, dims = c(1,2), group.by = 'snn_scran_k13', cols = 'Set3') + ggtitle('k=13')
+
+multiplot(ggclusterk10, ggclusterk11, ggclusterk12, ggclusterk13, layout=matrix(c(1,2,3,4), nrow=2, byrow=TRUE)) 
+
+```
+
+
+## Clustering obtained with random walks on a SNN graph using $k=12$
+
+We use the clustering obtained with random walks on a SNN graph using $k=12$ nearest neighbors.
+
+```{r clustering k12, fig.width=12, fig.height=10}
+
+# extract colors used for each cluster in the previous plots
+#dcol=unique(ggplot_build(ggclusterk12)$data[[1]][,c('colour','group')]);
+#dcol=dcol[order(dcol$group),]
+#dcol <- structure(.Data = dcol$colour, .Names = dcol$group)#-1)
+# add them to color annotations
+#cols <- dcol
+
+# Rename idents to place cluster number in order from EMP>Mye>Ery>Mk for sake of discussion in the paper
+Idents(sceall_seurat) <- "snn_scran_k12"
+sceall_seurat <- RenameIdents(sceall_seurat, '1'="5", '2'="6", '3'="2", '4'="9", '5'="7", '6'="4", '7'="1", '8'="8", '9'="3", '10'="10")
+
+
+sceall_seurat$snn_scran_k12 <- paste(Idents(sceall_seurat))
+clus2ident <- c('1'="EMP", '2'="MP", '3'="GP",  '4'="Mac", '5'="Mast", '6'='EP', '7'="early_Ery", '8'="mid_Ery", '9'="late_Ery", '10'="MkP")
+
+dircluster <- paste0(dirdata,'derived/FL/cluster_markers/')
+dir.create(dircluster)
+
+clusteringchoice <- 'snn_scran_k12'
+filelabel <- "snnRandomWalksClustering_k12"
+plottitle <- 'k=12'
+
+Idents(sceall_seurat) <-clus2ident[as.character(sceall_seurat[[clusteringchoice]][,1])]
+Idents(sceall_seurat) <- factor(Idents(sceall_seurat), levels = c("EMP", "MP", "GP", "Mac", "Mast", "EP", "early_Ery", "mid_Ery", "late_Ery", "MkP"))
+sceall_seurat$clusterIdents <- sceall_seurat$knn_clusters <- Idents(sceall_seurat)
+
+#set cluster colors
+cols_knn_clusters <- c('EMP' = "#B3DE69", 'MP' = "#BEBADA", 'GP' = "#80B1D3", 'Mac' = "#FB8072", 'Mast' = "#FDB462",     'EP' = "#8DD3C7", 'early_Ery' = "#FCCDE5", 'mid_Ery' = "#FFFFB3", 'late_Ery' = "#D9D9D9", 'MkP' = "#BC80BD")
+cols_knn_clusters <- structure(.Data=alpha(cols_knn_clusters, .8), .Names=names(cols_knn_clusters))
+
+cols_snn_scran12 <- c('1' = "#B3DE69", '2' = "#BEBADA", '3' = "#80B1D3", '4' = "#FB8072", '5' = "#FDB462", '6' = "#8DD3C7", '7' = "#FCCDE5", '8' = "#FFFFB3", '9' = "#D9D9D9", '10' = "#BC80BD")
+cols_snn_scran12 <- structure(.Data=alpha(cols_snn_scran12, .8), .Names=names(cols_snn_scran12))
+
+
+cols_knn_clusters2 <- c('EMP' = "#e6c9ff", 'MP' = "#8a90ff", 'GP' = "#8fc1ff", 'Mac' = "#323e99", 'Mast' = "#f7c868", 'EP' = "#ffc9ba", 'early_Ery' = "#ff8969", 'mid_Ery' = "#fa435f", 'late_Ery' = "#ad0720", 'MkP' = "#fa7602")
+cols_knn_clusters2 <- structure(.Data=alpha(cols_knn_clusters2, .6), .Names=names(cols_knn_clusters2))
+
+
+
+cols <- c(cols, cols_knn_clusters)
+
+sceall <- as.SingleCellExperiment(sceall_seurat)
+
+sceall_seurat_RNAraw <- sceall_seurat;
+sceall_seurat_RNAraw@active.assay <- 'RNA_raw'
+sceall_RNAraw <- as.SingleCellExperiment(sceall_seurat, assay = 'RNA_raw')
+
+```
+
+ 
+```{r plot UMAP of picked clustering, fig.width=10, fig.height=7, dpi=600}
+
+
+# plot clusters with legend
+# reorder legend
+sceall_seurat$knn_clusters <- factor(sceall_seurat$knn_clusters, levels=c("EMP", "MP", "GP", "Mac", "Mast", "EP", "early_Ery", "mid_Ery", "late_Ery", "MkP"))
+
+DimPlot(sceall_seurat, reduction = "umap", group.by = "knn_clusters", pt.size = 1.5, label = FALSE, label.size=6, cols=cols_knn_clusters2) + ggtitle(paste0("UMAP : Random Walk Clustering, ", plottitle))
+
+DimPlot(sceall_seurat, reduction = "umap", group.by = "knn_clusters", pt.size = 1.5, label = TRUE, label.size=6, cols=cols_knn_clusters2) + ggtitle(paste0("UMAP : Random Walk Clustering, ", plottitle))
+```
+
+```{r plot UMAP of FACS assigned gates, fig.width=10, fig.height=7, dpi=600}
+
+
+DimPlot(sceall_seurat, reduction = "umap", group.by = "progenitors", pt.size = 1.5, label = FALSE,  cols=cols_progenitors) + ggtitle(paste0("UMAP :", 'progenitor gate'))
+
+```
+
+
+```{r plot interactive UMAP plot of picked clustering, fig.width=10, fig.height=7, results=TRUE}
+library(plotly)
+g <- ggplot(as.data.table(cbind(sceall_seurat@reductions$umap@cell.embeddings, sceall_seurat@meta.data))) + geom_point(aes(x=UMAP_1, y=UMAP_2, color=clusterIdents, text=paste("cell:", cellID, "\nrun:", run)), size=.9) + scale_color_manual(values =cols_knn_clusters)
+ggplotly(g)
+```
+
+
+```{r function to summarize in which clusters a marker is found to be significant}
+# for each gene, extract the number of clusters in which it is detected as a marker
+
+summarizeMarkerSpecificity <- function(markers.out, fdr_cutoff) {
+
+  markers_sel <- lapply(markers.out,
+                        function(x, fdr_cutoff)
+                          subset(x, FDR < fdr_cutoff), fdr_cutoff=fdr_cutoff)
+  allmarkergenes <- unique(unlist(lapply(markers_sel, rownames)))
+
+  ismarker <- mapply(x=markers_sel,
+                     clus=names(markers_sel),
+                     function(x,clus) {
+                       xx <- x[allmarkergenes,c('FDR'),drop=F]
+                       #colnames(xx) <- paste0(clus,'FDR')
+                       rownames(xx) <- allmarkergenes
+                       return(xx)
+                     })
+
+  ismarker <- do.call(cbind, ismarker)
+  ismarker_boolean <- as.matrix(ismarker) < fdr_cutoff
+  ismarker_boolean[is.na(ismarker_boolean)] <- FALSE
+  ismarker_boolean <- as.data.table(ismarker_boolean)
+  ismarker_boolean[, nbClusters := rowSums(ismarker_boolean)]
+  ismarker_boolean[, gene := rownames(ismarker)]
+  ismarker_boolean <- ismarker_boolean[order(ismarker_boolean$nbClusters),]
+  ismarker_boolean <- ismarker_boolean[, c("gene", 'nbClusters',setdiff(colnames(ismarker_boolean), c('gene','nbClusters'))), with=FALSE]
+  ismarker_boolean[, whichClusters := paste0(sort(setdiff(colnames(ismarker_boolean), c('gene','nbClusters'))[which(unlist(.SD) == "TRUE")]), collapse=','), .SDcols = setdiff(colnames(ismarker_boolean), c('gene','nbClusters')), by = gene]
+
+  ismarker_info <- cbind(ismarker_boolean[, c('gene','nbClusters', 'whichClusters')], ismarker[match(ismarker_boolean$gene, rownames(ismarker)),])
+
+  ismarker_info$whichClusters <- gsub('.FDR', '', ismarker_info$whichClusters)
+
+  return(ismarker_info)
+
+
+}
+
+```
+
+
+
+### Cluster markers using pairwise $t$-tests and keeping only markers differential in at least half of the pairwise comparisons between clusters
+
+#### Using the dataset post-filtering 
+
+
+Markers of the clusters are obtained using the function `findMarkers` (which by default performs a t-test) and are written in the folder `/data/derived/FL/cluster_markers/`.   
+In this version, we keep genes that are found to be differential in at least half of the pairwise comparisons between clusters, i.e. the combined p-value across comparisons is chosen to be the "middle-most"  p-value of all tests (cluster comparisons) performed for a given gene. This is achieved using the option  `pval.type="some"` in the `findMarkers` function.     
+Results are available in the file `r paste0(dircluster,'FL_markersDiffInHalfComparisons_',filelabel,'.xlsx')`.
+
+
+```{r find half specific markers with findMarkers}
+
+fdr_threshold <-  0.01
+
+# write markers without using run as a blocking variable
+markers.out <- findMarkers(sceall, sceall[[clusteringchoice]], #block=sceall$groupForDoubletDetection,#snn_scran_k15
+    direction="up", assay.type="logcounts", pval.type = 'some')
+
+names(markers.out) <-clus2ident[as.character(names(markers.out))]
+
+markers.out <- lapply(markers.out, function(x) {
+  colnames(x) <- bayesbio:::mgsub(pattern=paste0('logFC.',1:length(unique(sceall_seurat[[clusteringchoice]][,1]))), 
+                                  replacement = paste0('logFC.', clus2ident), 
+                                  colnames(x)); 
+  x})
+
+markers_spe <- summarizeMarkerSpecificity(markers.out, fdr_cutoff =  fdr_threshold)
+markers_spe <- cbind(markers_spe, geneAnnot[match(markers_spe$gene,geneAnnot$mgi_symbol),])
+
+c(list(summary=as.data.frame(markers_spe)),lapply(markers.out, function(x) as.data.frame(x[x$FDR < fdr_threshold,]) %>%          cbind(gene=rownames(.), geneAnnot[match(rownames(.),geneAnnot$mgi_symbol),], .))) %>% 
+  openxlsx::write.xlsx(., file = paste0(dircluster,'FL_markersDiffInHalfComparisons_',filelabel,'.xlsx'), asTable = TRUE, row.names = TRUE) 
+
+```
+
+
+
+```{r upset plots, fig.width=8, fig.height=6}
+# look at intersection between genes associated with the clusters using an Upset plot
+
+forupset <- do.call(cbind,lapply(markers.out, function(x) as.integer(rownames(sceall) %in% rownames(x)[x$FDR < fdr_threshold & !is.na(x$FDR)])))
+rownames(forupset) <- rownames(sceall)
+# order clusters
+forupset <- forupset[,levels(sceall$clusterIdents)]
+UpSetR::upset(data=data.frame(forupset),  
+              line.size = 0.3, point.size = 2, 
+              sets = levels(sceall$clusterIdents),
+              keep.order = TRUE,
+              order.by = c('freq'),
+              sets.x.label = '#markers',
+              number.angles	= 0) #
+
+
+
+```
+
+```{r count half-spe markers diff in all comparisons, results='asis'}
+data.frame(nbDiffInHalfComparisons=sapply(markers.out, function(x) sum(x$FDR < fdr_threshold))) %>% knitr::kable() %>% print()
+```
+
+
+The markers in the top 5 for each cluster are shown in the heatmap plot.
+
+```{r heatmap plot half specific markers, fig.width=12, fig.height=8}
+sceall_seurat <- ScaleData(sceall_seurat)
+
+DoHeatmap(sceall_seurat, group.by = 'knn_clusters', features = unique(unlist(lapply(markers.out, function(x) rownames(x[x$FDR < .01,])[1:5]))), group.colors = cols_knn_clusters2) + ggtitle(plottitle) + 
+    theme(text = element_text(size = 16)) + 
+    scale_fill_gradientn(colors = colorRampPalette(c("#033080", "#ffffe3", "#edea13"))(256))
+
+```
+
+
+UMAP plot of the top 5 markers for each cluster is shown below.
+
+```{r feature plot top middle specific markers, fig.width=18, fig.height=20}
+FeaturePlot(sceall_seurat,  features = unique(unlist(lapply(markers.out, function(x) rownames(x[x$FDR < .01,])[1:5]))), ncol = 6, pt.size=0.5)
+
+```
+
+
+
+#### Enrichment analysis of the cluster markers
+
+Using the genes differential between each cluster and at least half of the other clusters (with FDR < 1%), we perform an enrichment/over-representation analysis using the MSigDB (http://software.broadinstitute.org/gsea/msigdb) and the GeneSetDB databases (https://www.genesetdb.auckland.ac.nz/haeremai.html).  The number of top genes used can be adapted depending on the obtained results.
+
+The top 10 enriched gene sets of a subset of collections of gene sets are shown below.
+The complete results are stored in the folders`data/derived/FL/cluster_markers/ORA_pathways_MsigDB/`
+
+
+```{r pathway analysis}
+
+library(EGSEA)
+library(openxlsx)
+
+
+# using MSigDB
+resora <- foreach(res = markers.out,
+                  clusterind = names(markers.out)) %do% {
+                    
+                    
+                    #geneids <- rownames(res[res$Top %in% 1:150 & res$FDR < .001,])
+                    geneids <- rownames(res[res$FDR < .01,])
+                    
+                    if (length(geneids) > 0) {
+                      
+                      logfc <- structure(#.Data = rowMeans(as.matrix(res[geneids,-c(1:3)]), na.rm = TRUE), 
+                                         .Data = apply(as.matrix(res[geneids,-c(1:3)]), 1, function(x) x[which.max(abs(x))]), 
+                                         .Names = geneids)
+                      
+                      entrezids <- as.character(geneAnnot[match(tolower(geneids), tolower(geneAnnot$mgi_symbol)),]$entrezgene_id   )
+                      logfc <-  logfc[!is.na(entrezids)]
+                      names(logfc) <- entrezids[!is.na(entrezids)]
+                      
+                      genes <- geneAnnot[match(tolower(rownames(res)),tolower(geneAnnot$mgi_symbol)), c('entrezgene_id', 'mgi_symbol')]
+                      colnames(genes) <- c('FeatureID', 'Symbols')
+                      genes <- na.omit(genes)
+                      
+                      gs.annots_ora = buildIdx(entrezIDs = names(logfc), species = "mouse", gsdb.gsets = "all", kegg.exclude = TRUE)
+                      ora = egsea.ora(geneIDs=names(logfc),
+                                      universe= na.omit(geneAnnot$entrezgene_id), 
+                                      logFC =as.matrix(logfc),
+                                      title=clusterind,  
+                                      gs.annots=gs.annots_ora, 
+                                      symbolsMap=genes[match(names(logfc), genes$FeatureID),], 
+                                      num.threads = 2, 
+                                      report = F,
+                                      report.dir = '')
+                    }
+                    else {
+                      ora <- NA
+                    }
+                    return(ora)
+                    
+                  }
+                  
+names(resora) = names(markers.out)
+
+# write in excel tables
+dir.create(paste0(dircluster,'ORA_pathways_MsigDB/'))
+foreach( comp = names(resora)) %do% {
+    getres <- mapply(res = resora[[comp]]@results, 
+           degenes = resora[[comp]]@gs.annots, 
+           function(res, degenes) {
+             res$test.results[[1]] <- subset(res$test.results[[1]], p.adj < .25)
+             if(nrow(res$test.results[[1]]) > 100) res$test.results[[1]] <- res$test.results[[1]][1:100,]
+             cbind(res$test.results[[1]], 
+                   degenes@anno[match(rownames(res$test.results[[1]]), rownames(degenes@anno)),],
+                   genes=sapply(degenes@idx[match(rownames(res$test.results[[1]]), names(degenes@idx))], 
+                          function(x, degenes) paste(geneAnnot$mgi_symbol[match(degenes@featureIDs[x],geneAnnot$entrezgene_id)], collapse=',')
+                   , degenes = degenes))
+           },  SIMPLIFY = FALSE)
+ 
+    
+    #unlist(lapply(getres, function(x) sum(x$p.adj < .1)))
+    openxlsx::write.xlsx(getres, file = paste0(dircluster,"ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',comp),".xlsx"), row.names=TRUE)
+}
+
+
+```
+
+
+```{r pathway ORA top 10 for half specific markers, results='asis'}
+
+topora <- foreach (comp = names(markers.out)) %do% {
+
+  reg <- readxl::read_xlsx(paste0(dircluster,"ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',comp),".xlsx"), 
+                           sheet = 'gsdbreg')
+  go <- readxl::read_xlsx(paste0(dircluster,"ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',comp),".xlsx"), 
+                          sheet = 'gsdbgo')
+  path <- readxl::read_xlsx(paste0(dircluster,"ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',comp),".xlsx"), 
+                            sheet = 'gsdbpath')
+  
+  reg <- subset(reg, p.adj < .05)
+  go <- subset(go, p.adj < .05)
+  path <- subset(path, p.adj < .05)
+  
+  cat(paste0('\n\n##### ',comp, '   \n\n') )
+  #%>% knitr::kable() %>% print()
+  reg[1:min(10,nrow(reg)),c("GeneSet", "p.adj", "NumGenes", "genes")]  %>% knitr::kable() %>% print()
+  go[1:min(10,nrow(go)),c("GeneSet", "p.adj", "NumGenes", "genes")]  %>% knitr::kable() %>% print()
+  path[1:min(10,nrow(path)),c("GeneSet", "p.adj", "NumGenes", "genes")] %>% knitr::kable() %>% print()
+}
+```
+
+
+```{r plot ORA results 1, fig.width=16, fig.height=16}
+# plot the results of over-enrichment analysis
+# returns a list containing a ggplot object and the table used to make the plot
+plotORA<- function(cluster, # cluster label
+                   resora, # table containing results of the ORA analysis
+                   padj_threshold, 
+                   fileplot=NULL, 
+                   drawplot=FALSE, # should the plot be drawn for each cluster
+                   facet_ncol=1, maxGeneSetsShown = 10,
+                   selcat=c("c2", "c3", "c4", "c5", "c6", "gsdbdrug", "gsdbpath", "gsdbdis",  "gsdbgo", "kegg"), ...) {
+  
+  getres <- lapply(resora[selcat], 
+                   function(res) {
+                     if (nrow(res) >0 )  {
+                       res <- subset(res, p.adj< padj_threshold)
+                       if (nrow(res) > maxGeneSetsShown)
+                         res <- res[1:maxGeneSetsShown,]
+                       return(res)
+                     }
+                   })
+  
+  # remove categories with no results
+  ind <- which(sapply(getres, function(x) if (!is.null(x)) nrow(x) else 0)>0)
+  
+  if (length(ind) > 0) getres <- getres[ind, drop=FALSE] else next
+  
+  # add category to results before merging
+  getres <- mapply(gr=getres,ngr=names(getres), function(gr,ngr) {gr$category <- ngr; gr}, SIMPLIFY = FALSE)
+  cn <- sapply(getres, function(x) {colnames(x)})
+  cn <- setdiff(names(which(table(unlist(cn)) == max(table(unlist(cn))))), "")
+  cn <- c('GeneSet', setdiff(cn, "GeneSet"))
+  # aggregate results into one table
+  getresmergesel <- do.call(rbind, lapply(getres, function(x) x[,cn, drop=FALSE]))
+  getresmergesel$cluster <- cluster
+  
+  getresmergesel  %>% .[order(.$category,.$p.adj, decreasing = TRUE),] 
+  getresmergesel$GeneSet <- toupper(getresmergesel$GeneSet)
+  getresmergesel$GeneSet <- factor(getresmergesel$GeneSet, levels = unique(getresmergesel$GeneSet))
+  getresmergesel$RatioGenes <- sapply(strsplit(getresmergesel$NumGenes, split='/'), function(x) as.numeric(x[1])/as.numeric(x[2]))
+  getresmergesel$NbGenes <- sapply(strsplit(getresmergesel$NumGenes, split='/'), function(x) as.numeric(x[1]))
+  
+  if (nrow(getresmergesel) > 0) {
+  gg <- ggplot(getresmergesel, 
+               aes(y=-log10(p.adj), x=GeneSet, fill=RatioGenes)) + 
+    ggforce::facet_col(~category, scales = 'free_y',  space='free') +
+    #facet_wrap(~category, scales = 'free')+
+    coord_flip() + 
+    xlab("") + ylab("-log10(adj.pval)") + ggtitle(cluster)+
+    theme_bw()+ 
+    geom_bar(stat='identity', position="dodge")  + coord_flip()  
+  
+  if (drawplot) {
+    if (!is.null(fileplot)) pdf(fileplot, ...)
+    print(gg)
+    if (!is.null(fileplot)) dev.off()
+  }
+  
+  return(list(plot=gg, dataplot=getresmergesel))
+  }
+}
+
+
+readAllORA<- function(resora_files,
+                      resora_names,
+                   padj_threshold, 
+                   maxGeneSetsShown = NULL) {
+  
+  # for each cluster, read all gene set results 
+  allresora <- 
+    mapply(resora_file = resora_files, 
+         resora_name = resora_names,
+         function(resora_file, resora_name) {
+    genesets <- openxlsx::getSheetNames(resora_file)
+    getres <- sapply(genesets, function(gs) openxlsx::read.xlsx(resora_file, sheet = gs), simplify = FALSE)
+    
+    # remove categories with no results
+    ind <- which(sapply(getres, function(x) if (!is.null(x)) nrow(x) else 0)>0)
+    if (length(ind) > 0) getres <- getres[ind, drop=FALSE] else return(NULL)
+ 
+    # add category to results before merging
+    getres <- mapply(gr=getres,ngr=names(getres), 
+                     function(gr,ngr) {
+                       gr$category <- ngr; 
+                       gr$rank <- order(gr$p.value)
+                       gr
+                       }, SIMPLIFY = FALSE)
+    cn <- sapply(getres, function(x) {colnames(x)})
+    cn <- setdiff(names(which(table(unlist(cn)) == max(table(unlist(cn))))), "")
+    cn <- c('GeneSet', setdiff(cn, "GeneSet"))
+    # aggregate results into one table
+    getresmergesel <- do.call(rbind, lapply(getres, function(x) x[,cn, drop=FALSE]))
+    getresmergesel$cluster <- resora_name
+
+    getresmergesel  %>% .[order(.$category,.$p.adj, decreasing = TRUE),] #"gsdbgo","gsdbpath", "c2"
+    getresmergesel$GeneSet <- toupper(getresmergesel$GeneSet)
+    getresmergesel$GeneSet <- factor(getresmergesel$GeneSet, levels = unique(getresmergesel$GeneSet))
+    getresmergesel$RatioGenes <- sapply(strsplit(getresmergesel$NumGenes, split='/'), function(x) as.numeric(x[1])/as.numeric(x[2]))
+    getresmergesel$NbGenes <- sapply(strsplit(getresmergesel$NumGenes, split='/'), function(x) as.numeric(x[1]))
+      
+    return(getresmergesel)
+    
+  }, SIMPLIFY = FALSE)
+
+  # merge results from all clusters in one data.frame
+  allresora <- do.call(rbind, allresora)
+  
+  # select gene sets significant in at least one cluster
+  if (is.null(maxGeneSetsShown))   {
+    signgs <- unique(allresora[allresora$p.adj < padj_threshold,]$GeneSet)
+  } else {
+    signgs <- unique(allresora[allresora$p.adj < padj_threshold & allresora$rank %in% 1:maxGeneSetsShown,]$GeneSet)
+
+    }
+  
+
+    
+  
+  allresora <- allresora[allresora$GeneSet %in% signgs,]
+  
+  return(allresora)
+  
+}
+
+padj_threshold <- 0.05
+maxGeneSetsShown <- 15
+
+
+```
+
+*  The plots of the top `r maxGeneSetsShown` or all gene sets with adjusted p-value lower than `r padj_threshold` are saved in in `r paste0(dircluster,"/ORA_pathways_MsigDB/plots/")`.
+
+
+
+```{r plot ORA results 2, fig.width=8, fig.height=12}
+
+dir.create(paste0(dircluster,"ORA_pathways_MsigDB/plots/"))
+
+padj_threshold <- 0.05
+maxGeneSetsShown <- 15
+
+# this will build one plot per cluster for all selected categories of gene sets 
+plotperclus <- foreach (comp = levels(Idents(sceall_seurat))) %do% {
+  
+  genesets <- openxlsx::getSheetNames(paste0(dircluster,"ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',comp),".xlsx"))
+
+  resora <- sapply(genesets, function(gs) openxlsx::read.xlsx(paste0(dircluster,"ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',comp),".xlsx"), sheet = gs))
+  
+gora <-  plotORA(cluster = comp, 
+         resora = resora, 
+         padj_threshold =  padj_threshold, 
+         fileplot = NULL,
+         drawplot= FALSE,#paste0("plots/voomWithQualityWeight_withflowcell_ORA_",region,'.pdf'), 
+          facet_ncol = 2, 
+          maxGeneSetsShown = maxGeneSetsShown,
+          selcat=c("gsdbpath", "gsdbreg", "gsdbgo", "kegg"),
+          width=20, height=15)
+#selcat = c("gsdbpath", "c5", "kegg"),
+return(gora)  
+}
+
+# plot them all individually, grouped by cluster
+#do.call(gridExtra::grid.arrange, c(lapply(plotperclus, function(x) x$plot), nrow=4))
+
+# ==== Plot by category ====
+# we merge the results tables for each cluster into one big table such that we can plot all clusters on a same plot 
+intcol <- names(which(table(unlist(sapply(plotperclus, function(x) colnames(x$dataplot)))) == length(plotperclus)))
+alloraressel <- do.call(rbind, lapply(plotperclus, function(x) x$dataplot[,intcol]))
+
+alloraressel$cluster <- factor(alloraressel$cluster, levels = levels(Idents(sceall_seurat)))
+  
+alloraressel$GeneSet <- as.character(alloraressel$GeneSet)
+
+foreach (catsel=c("gsdbpath", "gsdbreg", "gsdbgo", "kegg"),
+         width = c(12, 6, 12, 8),
+         height = c(23, 10, 23, 20)
+         )  %do% {
+catplot <- 
+  alloraressel[alloraressel$category == catsel,] %>% 
+  group_by(cluster, category) %>%
+  arrange(desc(-1/RatioGenes)) %>% 
+  ungroup() %>%
+  mutate( GeneSet = factor(paste(GeneSet, cluster, sep='__'),
+                           levels = rev(paste(GeneSet, cluster, sep='__')))) %>%
+  ggplot(aes(y=RatioGenes, x=GeneSet, 
+             fill=-log10(p.adj),
+  )) + 
+  theme_bw() +
+  geom_bar(stat='identity', position="dodge") + 
+  ggforce::facet_col(~cluster,  scales = 'free_y', space = 'free') + 
+  scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +
+  coord_flip() + ylab("% of genes from gene set that are DE") + 
+  ggtitle(label = catsel, subtitle = paste0(" adjusted p-value < ", padj_threshold, ", top ", maxGeneSetsShown))
+  
+ if (catsel == 'gsdbreg') print(catplot)
+
+ggsave(plot = catplot, 
+       filename =  paste0(dircluster,"ORA_pathways_MsigDB/plots/", catsel, "_orderByGeneRatio_padj", padj_threshold,"_", "top", maxGeneSetsShown, ".pdf"),
+       width = width, height = height, dpi = 300)
+
+
+}
+```
+
+
+```{r plot ORA results as circles, fig.width=9, fig.height=6}
+
+dir.create(paste0(dircluster,"ORA_pathways_MsigDB/plots/"))
+
+
+
+# ==== Plot showing all clusters at once  ====
+
+#padj_threshold <- 0.05
+maxGeneSetsShown <- 15
+# maxGeneSetsShown <- 10
+cat2plot <- c("gsdbpath", "gsdbreg", "gsdbgo", "kegg")
+
+## === showing top x (maxGeneSetsShown) only === 
+resallora <- readAllORA(resora_files = paste0(dircluster,"/ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',names(markers.out)),".xlsx"), 
+           resora_names = names(markers.out),
+           padj_threshold = padj_threshold, 
+           maxGeneSetsShown = maxGeneSetsShown)
+
+resallora$cluster <- factor(resallora$cluster, levels = levels(Idents(sceall_seurat)))
+
+foreach (catsel=cat2plot)  %do% {
+  catplot <-
+    resallora %>%
+    filter (category == catsel  & p.adj < padj_threshold) %>%
+    group_by(cluster) %>%
+    arrange(cluster, p.value) %>%
+    ungroup() %>%
+    mutate( GeneSet = factor(GeneSet,
+                             levels = rev(unique(GeneSet)))) %>%
+    ggplot(aes(y=cluster, x=GeneSet)) +
+    theme_bw() +
+    geom_point(aes(size=RatioGenes, colour = -log10(p.adj))) +
+    scale_x_discrete(labels = function(x) substr(gsub("__.+$", "", x), start = 1, stop = 80)) +
+    coord_flip() +
+    ggtitle(label = catsel, subtitle = paste0(" adjusted p-value < ", padj_threshold, ", top ", maxGeneSetsShown)) + ylab('')
+  
+  ggsave(plot = catplot, 
+       filename =  paste0(dircluster,"/ORA_pathways_MsigDB/plots/", catsel, "_circles_padj", padj_threshold,"_", "top", maxGeneSetsShown, '.pdf'),
+       width = if (catsel %in% c('gsdbgo','gsdbpath')) 15 else 10, height = if (catsel %in% c('gsdbgo','gsdbpath')) 13 else 7, dpi = 300)
+
+  
+
+}
+
+
+## === showing  all below a certain adjusted p-value threshold === 
+resallora <- readAllORA(resora_files = paste0(dircluster,"/ORA_pathways_MsigDB/FL_fastMNN_markersDiffInHalfComparisons_ORA_", filelabel, "_", gsub(' ','',names(markers.out)),".xlsx"), 
+           resora_names = names(markers.out),
+           padj_threshold = padj_threshold, 
+           maxGeneSetsShown = NULL)
+
+resallora$cluster <- factor(resallora$cluster, levels = levels(Idents(sceall_seurat)))
+
+foreach (catsel=cat2plot)  %do% {
+  catplot <-
+    resallora %>%
+    filter (category == catsel  & p.adj < padj_threshold) %>%
+    group_by(cluster) %>%
+    arrange(cluster, p.value) %>%
+    ungroup() %>%
+    mutate( GeneSet = factor(GeneSet,
+                             levels = rev(unique(GeneSet)))) %>%
+    ggplot(aes(y=cluster, x=GeneSet)) +
+    theme_bw() +
+    geom_point(aes(size=RatioGenes, colour = -log10(p.adj))) +
+    scale_x_discrete(labels = function(x) substr(gsub("__.+$", "", x), start = 1, stop = 80)) +
+    coord_flip() +
+    ggtitle(label = catsel, subtitle = paste0(" adjusted p-value < ", padj_threshold)) + ylab('')
+  
+  ggsave(plot = catplot, 
+       filename =  paste0(dircluster,"/ORA_pathways_MsigDB/plots/", catsel, "_circles_padj", padj_threshold,"_all.pdf"),
+       width = if (catsel %in% c('gsdbgo','gsdbpath')) 20 else 10, height = if (catsel %in% c('gsdbgo','gsdbpath')) 20 else 6, dpi = 300)
+
+  
+
+}
+
+
+
+
+```
+
+
+
+# Cell cycle quantification
+
+Using the `cyclone` function of the `scran` package.
+
+
+```{r cyclone on snn random clusters, fig.width=8, fig.height=8}
+
+library(org.Mm.eg.db)
+ensembl <- mapIds(org.Mm.eg.db, keys=rownames(sceall_seurat), keytype="SYMBOL", column="ENSEMBL")
+
+mm.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", 
+    package="scran"))
+
+assignments <- cyclone(as.SingleCellExperiment(sceall_seurat), mm.pairs, gene.names=ensembl, assay.type="logcounts")
+
+
+#
+sceall_seuratbis <- sceall_seurat
+sceall_seuratbis$G1 <- assignments$score$G1
+sceall_seuratbis$G2M <- assignments$score$G2M
+sceall_seuratbis$S <- assignments$score$S
+sceall_seuratbis$cyclone_cellcycle <- assignments$phases
+
+
+```
+
+```{r vln plots cyclone by cluster, fig.width=8, fig.height=8}
+
+vs1 <- VlnPlot(object = sceall_seuratbis , features = 'G1', group.by = 'knn_clusters', cols=cols_knn_clusters)+ggtitle('G1')+xlab('')+ylab('G1 Phase Score')+ geom_boxplot(outlier.shape = NA)+theme(legend.position="none")
+vs3 <- VlnPlot(object = sceall_seuratbis , features = 'G2M', group.by = 'knn_clusters', cols=cols_knn_clusters)+ggtitle('G2M')+xlab('')+ylab('G2M Phase Score')+ geom_boxplot(outlier.shape = NA)+theme(legend.position="none")
+vs2 <- VlnPlot(object = sceall_seuratbis , features = 'S', group.by = 'knn_clusters', cols=cols_knn_clusters)+ggtitle('S')+xlab('')+ylab('S Phase Score')+ geom_boxplot(outlier.shape = NA)+theme(legend.position="none")
+ print(gridExtra::marrangeGrob(list(vs1,vs2,vs3),ncol = 1, nrow=3, top='cyclone cell cycle Phase score vs. clusters for k=12'))
+
+
+```
+
+
+```{r color umap by cyclone cell cycle stage, fig.width=7, fig.height=6}
+
+cols_cyclone <- c('G1' = "#FB8072", 'G2M' = "#B3DE69", 'S' = "#FDD662")
+
+d12umap_cyclone <- DimPlot(object = sceall_seuratbis , reduction = "umap", group.by = 'cyclone_cellcycle', cols=cols_cyclone, pt.size = 1, dims = c(1,2)) + 
+  ggtitle('') #Colored by Cell cycle phase
+
+
+print(gridExtra::marrangeGrob(list(d12umap_cyclone),ncol = 1, nrow=1,  #,d12tsne_cyclone
+                              top='Assignment of cell cycle Phase made by cyclone'))
+
+
+sceall_seurat <- sceall_seuratbis
+
+```
+
+
+
+# Proportion of Condition, Cell Cycle Phase, and Progenitor gates  among Clusters
+
+
+```{r proportion cell cycle phase among clusters, echo=F, warning=F, message=F, results=T, fig.height=2, fig.width=5}
+
+#cyckone cell cycle phase among clusters
+scale(table(sceall_seurat$cyclone_cellcycle, sceall_seurat$knn_clusters), scale=colSums(table(sceall_seurat$cyclone_cellcycle, sceall_seurat$knn_clusters)), center = FALSE) %>% reshape2::melt() %>% mutate(.,cellcycle=Var1) %>% mutate(cluster=Var2) %>% ggplot(., aes(x=cluster, y =value, fill=cellcycle)) + geom_bar(stat = 'identity') + scale_fill_manual(values = cols_cyclone)
+
+
+scale(table(sceall_seurat$condition, sceall_seurat$knn_clusters), scale=colSums(table(sceall_seurat$cyclone_cellcycle, sceall_seurat$knn_clusters)), center = FALSE) %>% reshape2::melt() %>% mutate(.,condition=Var1) %>% mutate(cluster=Var2) %>% ggplot(., aes(x=cluster, y =value, fill=condition)) + geom_bar(stat = 'identity') + scale_fill_manual(values = c('E10.5_Fetal_Liver'="#e6dc85", 'E12.5_Fetal_Liver'="#97d2db"))
+
+
+scale(table(sceall_seurat$condition, sceall_seurat$knn_clusters), scale=colSums(table(sceall_seurat$cyclone_cellcycle, sceall_seurat$knn_clusters)), center = FALSE) %>% reshape2::melt() %>% mutate(.,condition=Var1) %>% mutate(cluster=Var2) %>% ggplot(., aes(x=cluster, y =value, fill=condition)) + geom_bar(stat = 'identity') + scale_fill_manual(values = c('E10.5_Fetal_Liver'="#e6dc85", 'E12.5_Fetal_Liver'="#97d2db"))
+
+scale(table(sceall_seurat$knn_clusters, sceall_seurat$progenitors), scale=colSums(table(sceall_seurat$knn_clusters, sceall_seurat$progenitors)), center = FALSE) %>% reshape2::melt() %>% mutate(cluster=Var1) %>% mutate(progenitor=Var2) %>% ggplot(., aes(x=progenitor, y =value, fill=cluster)) + geom_bar(stat = 'identity') + scale_fill_manual(values = cols_knn_clusters)
+
+
+```
+
+## Gata1 and Sfpi1 expression
+
+
+Cells are ordered with respect to Gata1 expression. 
+
+```{r, fig.width=5, fig.height=4}
+# Cells to use
+E10.5FL_cells <-sceall_seurat@meta.data %>% tibble::rownames_to_column('cell') %>%
+  filter(condition == "E10.5_Fetal_Liver") %>% dplyr::select('cell') %>% unlist
+
+E12.5FL_cells <-sceall_seurat@meta.data %>% tibble::rownames_to_column('cell') %>%
+  filter(condition == "E12.5_Fetal_Liver") %>% dplyr::select('cell') %>% unlist
+
+barplot(sceall_seurat@assays$RNA@data['Gata1',][order(sceall_seurat@assays$RNA@data['Gata1',])],  col = alpha('turquoise',.5),  border = NA, names.arg = FALSE)
+barplot(sceall_seurat@assays$RNA@data['Sfpi1',][order(sceall_seurat@assays$RNA@data['Gata1',])],  col = alpha('blue',.7),  border = NA, names.arg = FALSE, add=TRUE)
+legend('top',legend=c('Gata1','Spfi1'), pch=16, col=c('turquoise', 'blue'))
+
+
+```
+
+### Number of cells expressing both, neither, or only one of Gata1 and Sfpi1
+
+```{r}
+thresholdExpr <- 1
+```
+
+```{r build expr categories, results=TRUE}
+
+  bothExpr <- tibble(cellID=colnames(sceall_seurat),
+                     Gata1=sceall_seurat@assays$RNA@data['Gata1',],
+                     Sfpi1=sceall_seurat@assays$RNA@data['Sfpi1',]) %>% 
+    filter(., Gata1 > thresholdExpr & Sfpi1 > thresholdExpr)
+  bothExpr <- bothExpr$cellID
+  
+  onlyGata1 <- tibble(cellID=colnames(sceall_seurat),
+                     Gata1=sceall_seurat@assays$RNA@data['Gata1',],
+                     Sfpi1=sceall_seurat@assays$RNA@data['Sfpi1',]) %>% 
+    filter(., Gata1 > thresholdExpr & Sfpi1 <= thresholdExpr)
+  onlyGata1 <- onlyGata1$cellID
+
+    onlySfpi1 <- tibble(cellID=colnames(sceall_seurat),
+                     Gata1=sceall_seurat@assays$RNA@data['Gata1',],
+                     Sfpi1=sceall_seurat@assays$RNA@data['Sfpi1',]) %>% 
+    filter(., Gata1 <= thresholdExpr & Sfpi1 > thresholdExpr)
+  onlySfpi1 <- onlySfpi1$cellID
+
+     neither <- tibble(cellID=colnames(sceall_seurat),
+                     Gata1=sceall_seurat@assays$RNA@data['Gata1',],
+                     Sfpi1=sceall_seurat@assays$RNA@data['Sfpi1',]) %>% 
+    filter(., Gata1 <= thresholdExpr & Sfpi1 <= thresholdExpr)
+  neither <- neither$cellID
+  
+  tmp <- sceall_seurat
+  tmp$expr_Sfpi1_Gata1 <- NA
+  tmp@meta.data[bothExpr,]$expr_Sfpi1_Gata1 <- 'both'
+  tmp@meta.data[neither,]$expr_Sfpi1_Gata1 <- 'neither'
+  tmp@meta.data[onlyGata1,]$expr_Sfpi1_Gata1 <- 'only_Gata1' 
+  tmp@meta.data[onlySfpi1,]$expr_Sfpi1_Gata1 <- 'only_Sfpi1'
+
+  
+  table(tmp$expr_Sfpi1_Gata1, tmp$condition) %>% knitr::kable()
+  table(tmp$expr_Sfpi1_Gata1, tmp$clusterIdents) %>% knitr::kable()
+
+```
+
+
+
+### Percent of cells *by cluster* across categories of expression
+
+```{r perc cells by cluster across cat, results=TRUE, fig.width=4.5, fig.height=2}
+signif(scale((table(tmp$expr_Sfpi1_Gata1, tmp$clusterIdents)), 
+             scale=as.vector(table(tmp$clusterIdents)), center=FALSE)*100, 3) %>% knitr::kable()
+
+ scale((table(tmp$expr_Sfpi1_Gata1, tmp$clusterIdents)), 
+             scale=as.vector(table(tmp$clusterIdents)), center=FALSE) %>% 
+     reshape2::melt() %>% 
+     mutate(.,expr_cat=Var1) %>% 
+     mutate(cluster=Var2) %>% 
+     ggplot(., aes(x=cluster, y =value, fill=expr_cat)) + 
+     geom_bar(stat = 'identity') #+ 
+     # scale_fill_manual(values = cols_cyclone)
+
+
+```
+
+# FACS Plots  
+
+
+
+```{r}
+VlnPlot(object = subset(sceall_seurat, idents = c("EMP", "MP", "GP", "Mac", "Mast")) , features = 'CD16_32_MFI', cols = c("lightgrey", "lightgrey", "lightgrey", "lightgrey", "lightgrey")) +ylab('MFI')+ geom_boxplot(outlier.shape = NA) +theme(legend.position="none")
+
+```
+
+```{r}
+FeaturePlot(object = sceall_seurat , reduction = "umap", features  = "CD16_32_MFI",  pt.size = 1, min.cutoff="q10", max.cutoff="q95", dims = c(1,2), label = FALSE, cols = c("lightgrey", "royalblue4"))
+
+```
+
+
+
+# Data Storage
+
+The data with clustering results are saved in `r filesce`. The colors are saved in `r paste0(dirdata,'derived/FL/FL_cols.rds')`.
+
+```{r}
+sceall_seurat <- sceall_seuratbis
+saveRDS(sceall_seurat, file = filesce)
+saveRDS(cols, file=paste0(dirdata,'derived/FL/FL_cols.rds'))
+
+```
+
+
+
+