From a2349e304a1e7168ff316928548d2c93a5d97499 Mon Sep 17 00:00:00 2001
From: Claudia  CHICA <claudia.chica@pasteur.fr>
Date: Fri, 20 May 2022 18:13:54 +0200
Subject: [PATCH] Update Wednesday/MAF/EPITOME_CS_integration.Rmd

---
 Wednesday/MAF/EPITOME_CS_integration.Rmd | 1107 ++++++++++++++++++++++
 1 file changed, 1107 insertions(+)
 create mode 100644 Wednesday/MAF/EPITOME_CS_integration.Rmd

diff --git a/Wednesday/MAF/EPITOME_CS_integration.Rmd b/Wednesday/MAF/EPITOME_CS_integration.Rmd
new file mode 100644
index 0000000..09d0d10
--- /dev/null
+++ b/Wednesday/MAF/EPITOME_CS_integration.Rmd
@@ -0,0 +1,1107 @@
+---
+title: "Epigenomic memory of infection: Data integration"
+subtitle : "H3K4me2, transcriptome time courses and chromatin state"
+author: "Claudia Chica"
+date: "02.2021"
+output: 
+  html_document: 
+    keep_md: yes
+    number_sections: yes
+    smart: no
+    toc: yes
+    toc_float: yes
+editor_options: 
+  chunk_output_type: console
+---
+
+```{r setup, message = FALSE, warning = FALSE, echo = FALSE}
+knitr::opts_chunk$set(message=FALSE,warning=FALSE,echo=FALSE)#, cache=TRUE, cache.lazy = FALSE)
+knitr::opts_knit$set(progress = FALSE, verbose = FALSE)
+#knitr::opts_chunk$set(dev = "svg")
+
+WDIR="/Volumes/bioit/12891_Chevalier_EpiMemStrep"
+
+library(ggplot2)
+library(plyranges)
+library(RColorBrewer)
+library(knitr)
+library(pheatmap)
+library(patchwork)
+library(FactoMineR)
+library(org.Hs.eg.db)
+library(msigdbr)
+
+
+```
+
+# Methylation profile
+
+```{r loadEPIGresults}
+# Load epigenomic counts
+mark="H3K4me2"
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/countNormLength_",mark,".RData")
+load(fileOut)
+
+# Discard ONE peak with negative normalised values in a SINGLE replicate (artefact from cycliclowess normalisation)
+countNormLength_H3K4me2=countNormLength_H3K4me2[countNormLength_H3K4me2$norm.H3K4me2_UI_REP1>0,]
+
+# Get list of DM peaks
+files=list.files(path = paste0(WDIR,"/results_project12891/tables/tables_v3_UCSC/",mark,"/"),pattern = "resAnDif_with_annotation_sorted.final.xls", full.names = T)
+DM_results=list() ; i=1
+for (fileDM in files) {
+  no_col <- max(count.fields(fileDM, sep = "\t"))
+  tmp=read.table(fileDM,header = F,sep="\t", fill = TRUE,skip = 1,
+                 col.names=1:no_col,colClasses = c("character",rep("numeric",16),rep("character",no_col-17)))
+  row.names(tmp) <- tmp[,1]
+  tmp=tmp[,14:17]
+  colnames(tmp) <- c("baseMean", "log2FoldChange", "pvalue", "padj")
+  tmp=tmp[row.names(countNormLength_H3K4me2),]
+  DM_results[[i]]=tmp ; i=i+1
+}
+rm(tmp)
+names(DM_results) <- c("D7vsH3","D7vsUI","H3vsUI")
+
+# DM profiles
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/DMpeaks_",mark,".RData")
+load(fileOut)
+
+# Add not DEGs
+DMpeaks_H3K4me2=intersect(DMpeaks_H3K4me2,row.names(DM_results$D7vsUI))
+tmp=setdiff(row.names(countNormLength_H3K4me2),DMpeaks_H3K4me2)
+DMs=data.frame(peakID=DMpeaks_H3K4me2,
+               Kind="DM",
+Sign=ifelse(DM_results$D7vsUI[DMpeaks_H3K4me2,]$log2FoldChange>=0,"Gain","Loss"))
+notDMs=data.frame(peakID=tmp,
+                  Kind="notDM",
+Sign=ifelse(DM_results$D7vsUI[tmp,]$log2FoldChange>=0,"Gain","Loss"))
+DMs=rbind(DMs,notDMs)
+DMs$peakID=as.character(DMs$peakID)
+rm(tmp,notDMs)
+
+kable(table(DMs$Kind,DMs$Sign))
+```
+
+# Transcriptional profile
+```{r loadTOMEresults}
+# Expression data normalised and batch corrected 
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/Data_RMAandNoBatch_Annot.RData")
+load(fileOut)
+
+# DA results, same object used for the functional analysis
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/DE_results_Annot.RData")
+load(fileOut)
+
+SYMBOL2ENTREZ=unique(data.frame(SYMBOL=as.character(DE_results$H3vsUI$SYMBOL),
+                         ENTREZID=as.character(DE_results$H3vsUI$ENTREZID),stringsAsFactors = FALSE))
+
+# DEGs padjTh=.05 ; lfcTh=0
+fileOut=paste0(WDIR,"/results_project12891/tables/transcriptome/DEGs_H3K4me2profiles_CompleteAnnotationPeaks_nearestGene.txt")
+DEGs=read.table(fileOut,header = T)[,7:10]
+DEGs=unique(na.omit(DEGs))
+DEGs$SYMBOL=as.character(DEGs$SYMBOL)
+DEGs$ENTREZID=as.character(DEGs$ENTREZID)
+# Add not DEGs
+tmp=setdiff(SYMBOL2ENTREZ$ENTREZID,DEGs$ENTREZID)
+H3vsUI=DE_results$H3vsUI[!duplicated(DE_results$H3vsUI$ENTREZID),]
+DIvsD7=DE_results$DIvsD7[!duplicated(DE_results$DIvsD7$ENTREZID),]
+notDEGs=data.frame(ENTREZID=SYMBOL2ENTREZ[SYMBOL2ENTREZ$ENTREZID %in% tmp,]$ENTREZID,
+                   SYMBOL=SYMBOL2ENTREZ[SYMBOL2ENTREZ$ENTREZID %in% tmp,]$SYMBOL,
+                   Kind="notDE",
+# Sign=ifelse(H3vsUI[H3vsUI$ENTREZID %in% tmp,]$logFC>=0,
+#      ifelse(DIvsD7[DIvsD7$ENTREZID %in% tmp,]$logFC>=0,"Up","UpDown"),
+#      ifelse(DIvsD7[DIvsD7$ENTREZID %in% tmp,]$logFC<0,"Down","DownUp")))
+Sign=ifelse(H3vsUI[H3vsUI$ENTREZID %in% tmp,]$logFC>=0,"Up","Down"))
+DEGs=rbind(DEGs,notDEGs)
+DEGs$SYMBOL=as.character(DEGs$SYMBOL)
+DEGs$ENTREZID=as.character(DEGs$ENTREZID)
+rm(tmp,H3vsUI,DIvsD7,notDEGs)
+
+kable(table(DEGs$Kind,DEGs$Sign))
+
+```
+
+```{r load_peaks2Gene}
+mark="H3K4me2"
+### Nearest gene
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mergedPeaks_annotNearest_",mark,".RData")
+load(fileOut)
+peaks2GeneNearest_H3K4me2=data.frame(
+  peakId=mergedPeaks_annotNearest_H3K4me2@ranges@NAMES,
+  SYMBOL=as.character(mergedPeaks_annotNearest_H3K4me2@elementMetadata@listData[["mcols.GENESYMBOL"]]),
+  distance=mergedPeaks_annotNearest_H3K4me2@elementMetadata@listData[["distance"]],
+  stringsAsFactors=FALSE)
+peaks2GeneNearest_H3K4me2=merge(peaks2GeneNearest_H3K4me2,SYMBOL2ENTREZ,by.x=2,by.y=1,all.x=TRUE)[,c(2,1,3,4)]
+
+### Target gene from MEME
+# fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mergedPeaks_annotTgene_",mark,".RData")
+# load(fileOut)
+# peaks2GeneTgene_H3K4me2=data.frame(peakId=mergedPeaks_annotTgene_H3K4me2$peakId,
+#                   SYMBOL=as.character(mergedPeaks_annotTgene_H3K4me2$Gene_Name),
+#                   ENTREZID=mergedPeaks_annotTgene_H3K4me2$ENTREZID,
+#                   distance=0,stringsAsFactors=FALSE)
+
+# Load ENTREZ ENSEMBL ids mapping to cross Tgene and EPI-TOME information
+ENSEMBLEids <- org.Hs.egENSEMBL 
+mapped_genes <- mappedkeys(ENSEMBLEids)
+ENTREZtoENSEMBL <- as.data.frame(ENSEMBLEids[mapped_genes])
+colnames(ENTREZtoENSEMBL)=c("ENTREZID","ENSEMBLEID")
+
+# Load Tgene MEME results
+fileIn=paste0(WDIR,"/results_project12891/MEME_Tgene/H2K4me2/links.tsv")
+mergedPeaks_Tgene=read.table(fileIn,header = T,sep = "\t")
+## Get only significant associations peak-gene
+pvalTh=.05 #; FDRth=.5
+mergedPeaks_TgeneSig=subset(mergedPeaks_Tgene,mergedPeaks_Tgene$CnD_P_Value < pvalTh)
+mergedPeaks_TgeneSig$chr=gsub("chr","",gsub(":[0-9-]*","",mergedPeaks_TgeneSig$RE_Locus))
+mergedPeaks_TgeneSig$start=as.character(as.integer(gsub("chr[0-9XY]*:","",gsub("-[0-9]*","",mergedPeaks_TgeneSig$RE_Locus)))-1)
+mergedPeaks_TgeneSig$end=gsub(".*-","",mergedPeaks_TgeneSig$RE_Locus)
+mergedPeaks_TgeneSig$peakID=apply(mergedPeaks_TgeneSig[,18:20],1,function(x){paste0(x[1],"_",x[2],"_",x[3])})
+mergedPeaks_TgeneSig$ENSEMBL=gsub("\\.[0-9-]*","",mergedPeaks_TgeneSig$Gene_ID)
+peaks2GeneTgene_H3K4me2=na.omit(merge(mergedPeaks_TgeneSig,
+                           ENTREZtoENSEMBL,by.x=22,by.y=2,all.x=TRUE)[,c(22,23,10,14,17)])
+
+# Select unique gene-region associations 
+# select association minimun Tgene pvalue 
+peaks2GeneTgene_H3K4me2$peak2gene=paste0(peaks2GeneTgene_H3K4me2$peakID,"_",
+  peaks2GeneTgene_H3K4me2$ENTREZID)
+peaks2GeneTgene_H3K4me2$tmp=paste0(peaks2GeneTgene_H3K4me2$peakID,"_",
+  peaks2GeneTgene_H3K4me2$ENTREZID,"_",as.character(peaks2GeneTgene_H3K4me2$CnD_P_Value))
+peaks2GeneTgene_H3K4me2=peaks2GeneTgene_H3K4me2[!duplicated(peaks2GeneTgene_H3K4me2$tmp),]
+
+tmp=aggregate(peaks2GeneTgene_H3K4me2$CnD_P_Value,list(peaks2GeneTgene_H3K4me2$peak2gene),min)
+tmp$tmp=paste0(tmp$Group.1,"_",as.character(tmp$x))
+peaks2GeneTgene_H3K4me2=peaks2GeneTgene_H3K4me2[peaks2GeneTgene_H3K4me2$tmp %in% tmp$tmp,-7]
+
+rm(mergedPeaks_TgeneSig,ENSEMBLEids,mapped_genes,ENTREZtoENSEMBL)
+```
+
+# Chromatin states
+
+```{r selectPeaks, eval=FALSE, include=FALSE}
+# Select peaks using counts
+logFCthUp_H3K4me2=.005 #quantile .75
+logFCthDown_H3K4me2=-.005 #quantile .25
+tmp=countNormLength_H3K4me2
+selPeaksAllRep=row.names(
+  subset(tmp,(
+  (log(tmp$norm.H3K4me2_D7_REP1/tmp$norm.H3K4me2_UI_REP1,2) > logFCthUp_H3K4me2 & 
+   log(tmp$norm.H3K4me2_D7_REP2/tmp$norm.H3K4me2_UI_REP2,2) > logFCthUp_H3K4me2)  |
+  (log(tmp$norm.H3K4me2_D7_REP1/tmp$norm.H3K4me2_UI_REP1,2) < logFCthDown_H3K4me2 &  
+   log(tmp$norm.H3K4me2_D7_REP2/tmp$norm.H3K4me2_UI_REP2,2) < logFCthDown_H3K4me2))))
+
+selPeaksAll=row.names(tmp)
+
+# Select peaks related to DEGs 
+selPeaks_DEGs_nearest=unique(merge(DEGs[DEGs$Kind!="notDE",],peaks2GeneNearest_H3K4me2[,c(4,1)],by=1)[,5])
+selPeaks_DEGs_target=unique(merge(DEGs[DEGs$Kind!="notDE",],peaks2GeneTgene_H3K4me2[,c(3,1)],by=1)[,5])
+
+print(c("Number of:"))
+print(c("Total peaks",dim(tmp)[1]))
+print(c("Selected peaks - methylation logFold",length(selPeaksAll)))
+print(c("Selected peaks - reproducible methylation logFold",length(selPeaksAllRep)))
+print(c("Selected peaks - nearest to DEG",length(selPeaks_DEGs_nearest)))
+print(c("Selected peaks - with DEG target",length(selPeaks_DEGs_target)))
+
+print(c("Methylation logFold nearest to DEG",length(intersect(selPeaksAll,selPeaks_DEGs_nearest))))
+print(c("Methylation logFold with DEG target",length(intersect(selPeaksAll,selPeaks_DEGs_target))))
+print(c("Reproducible methylation logFold nearest to DEG",length(intersect(selPeaksAllRep,selPeaks_DEGs_nearest))))
+print(c("Reproducible methylation logFold with DEG target",length(intersect(selPeaksAllRep,selPeaks_DEGs_target))))
+
+ggplotdf=data.frame(log2FC=c(
+  DM_results$D7vsUI[selPeaksAll,]$log2FoldChange,
+  DM_results$D7vsUI[selPeaksAllRep,]$log2FoldChange,
+  DM_results$D7vsUI[selPeaks_DEGs_nearest,]$log2FoldChange,
+  DM_results$D7vsUI[selPeaks_DEGs_target,]$log2FoldChange),
+  peakPopulation=factor(c(rep("All peaks",length(selPeaksAll)),
+                   rep("Reproducible Log2FC",length(selPeaksAllRep)),
+                   rep("DEGs nearest",length(selPeaks_DEGs_nearest)),
+                   rep("DEGs target",length(selPeaks_DEGs_target))),
+                   levels = c("All peaks","Reproducible Log2FC","DEGs nearest","DEGs target")))
+
+ggplot(ggplotdf, aes(x=log2FC,fill=peakPopulation)) +
+    geom_density(alpha=0.5) + facet_grid(~ peakPopulation)
+```
+
+```{r get_mergedPeaks_annotTXS}
+# Load all merged peaks coords
+fileIn=paste0(WDIR,"/results_project12891/tables/tables_v3_UCSC/",mark,"/",mark,"_mergedPeaks.bed")
+mergedPeaks=read_bed(fileIn)
+
+# Annotate merged peaks
+fileAnnot=paste0(WDIR,"/results_project12891/igv/igv_session_v3/annotation/hg19_transcripts.geneSymbol.reformat.bed")
+txs_gr <- read_bed(fileAnnot)
+tss_gr <- GRanges(seqnames = txs_gr@seqnames,
+                   ranges = IRanges(start = txs_gr@ranges@start-1e3, end = txs_gr@ranges@start+1e3),
+                   strand = txs_gr@strand,
+                   mcols = data.frame(GENESYMBOL=mcols(txs_gr)$name))
+txs_gr <- GRanges(seqnames = txs_gr@seqnames,
+                   ranges = txs_gr@ranges,
+                   strand = txs_gr@strand,
+                   mcols = data.frame(GENESYMBOL=mcols(txs_gr)$name))
+
+mergedPeaks_annotTXS=array(NA,dim=length(mergedPeaks)) ; names(mergedPeaks_annotTXS)=mergedPeaks$name
+ovtss=unique(findOverlaps(mergedPeaks,tss_gr)@from)
+mergedPeaks_annotTXS[ovtss]="TSS"
+ovintraG=setdiff(unique(findOverlaps(mergedPeaks,txs_gr)@from),ovtss)
+mergedPeaks_annotTXS[ovintraG]="intraG"
+mergedPeaks_annotTXS[is.na(mergedPeaks_annotTXS)]="interG"
+```
+
+```{r annotationQuantification}
+DIR=paste0(WDIR,"/results_project12891/Encode_data/bigWig/")
+fileMetadata=paste0(DIR,"ENCODE_metadata_bigWig_foldChangeOverControl.released.txt")
+Encode_metadata <- read.table(file=fileMetadata,header=TRUE,sep="\t")
+
+marks=c("H3K4me3", "H3K4me2", "H3K27ac", "H3K4me1", "H3K79me2", "H3K27me3")
+AnnotationCols <- data.frame(
+  Mark=factor(sub("-human","",Encode_metadata$Experiment.target),levels = marks),
+  row.names = Encode_metadata$File.accession)
+
+colsG=brewer.pal(n = 3, name = "Greys")
+colsM=brewer.pal(n = 4, name = "Greens")
+colsS=brewer.pal(n = 3, name = "Blues")[2:3]
+nClust=8 ; colsC=brewer.pal(n = nClust, name = "Set2")
+colsAnnot <- list(Mark=c("H3K4me3"=colsS[1], "H3K4me2"=colsS[2], "H3K27ac"=colsM[1], "H3K4me1"=colsM[2], "H3K79me2"=colsM[3], "H3K27me3"=colsM[4]),
+  DM_sign=c("Gain"="#FF0000", "Loss"="#FF000030"),
+  DM_kind=c("DM"="#000000", "notDM"="transparent"),
+  Localization=c("TSS"=colsG[1], "intraG"=colsG[2], "interG"=colsG[3]),
+  CS=c("1"=colsC[1],"2"=colsC[2],"3"=colsC[3],"4"=colsC[4],"5"=colsC[5],"6"=colsC[6],"7"=colsC[7],"8"=colsC[8]))
+```
+
+```{bash getMtx_ENCODElogchange, eval=FALSE, include=FALSE}
+WDIR=/pasteur/projets/policy01/BioIT/12891_Chevalier_EpiMemStrep/results_project12891/
+module load UCSC-tools/v373
+module load bedtools/2.25.0
+
+# 1. Download bigWig files
+# 2. Get bedGraph files
+cd Encode_data/bigWig/
+for file in *.bigWig ; do 
+if [ ! -f ${file/bigWig/bedGraph} ] ; then echo $file ; 
+srun bigWigToBedGraph $file ${file/bigWig/bedGraph} &  fi ; done
+
+# 3. Overlap with mergedPeaks
+mark=H3K4me2
+filePeaks=$WDIR/tables/tables_v3_UCSC/*/${mark}_mergedPeaks.bed
+cd $WDIR/Encode_data/bigWig
+
+# 3.1. Small files
+emacs overlapBedGraph_mergedPeaks.sh
+#!/bin/sh
+i=$(head -n ${SLURM_ARRAY_TASK_ID} $1 | tail -n 1) 
+WDIR=$2
+filePeaks=$3
+mark=$4
+
+if [ ! -f $WDIR/${i/.bedGraph/_mergedPeaks_${mark}.ov} ] ; then 
+srun -o $i.ovb -e $i.e intersectBed -a $filePeaks -b $i -wb
+awk '{s[$4]=s[$4]+$NF ; n[$4]=n[$4]+1} END {for (p in n) {print p"\t"s[p]/n[p]}}' $i.ovb | 
+sort -k1,1 > $WDIR/${i/.bedGraph/_mergedPeaks_${mark}.ov} ; fi
+
+ls *bedGraph > fileList.txt
+n=`wc -l fileList.txt | awk '{print $1}'`
+sbatch -p hubbioit --qos hubbioit -o ov.o -e ov.e --mem=200G --array=1-$n%$n overlapBedGraph_mergedPeaks.sh fileList.txt $WDIR/Encode_data/bigWig/ $filePeaks $mark
+
+# 4 Assemble matrix
+echo "Peak_name" > header0 ; i=0
+for file in *$mark*ov ; do let j=$i+1
+echo ${file/_mergedPeaks_${mark}.ov} | paste header$i - > header$j ; ((i++)) ; done
+paste *$mark*ov | awk '{line=$1 ; for (i=2;i<=NF;i=i+2) {line=line"\t"$i} ; print line}' |
+cat header$j - > ENCODE_mergedPeaks_${mark}.ov
+
+# Clean
+rm $WDIR/Encode_data/bigWig/*.ovb $WDIR/Encode_data/bigWig/*.bedGraph
+rm $WDIR/Encode_data/bigWig/*.e $WDIR/Encode_data/bigWig/ov.o
+rm header*
+```
+
+```{r ENCODE_bigWig}
+DIR=paste0(WDIR,"/results_project12891/Encode_data/bigWig/")
+fileSignal=paste0(DIR,"ENCODE_mergedPeaks_",mark,".ov")
+matSignal <- read.table(file=fileSignal,header=TRUE,row.names = 1,sep="\t")
+
+markOrder=c(#H3K4me3
+  "ENCFF628ANV", "ENCFF510LTC", "ENCFF556OVF",
+  #"ENCFF313GRK", "ENCFF709KWY", "ENCFF635VEW",
+  "ENCFF250OYQ", "ENCFF189JCB", "ENCFF421ZNH",
+  #H3K4me2
+  "ENCFF876DTJ", "ENCFF975YWP", "ENCFF419LFZ",
+  #H3K4me1
+  "ENCFF165ZPD", "ENCFF569ZJQ", "ENCFF613NHX",
+  #H3K27ac
+  #"ENCFF216GUZ", "ENCFF597SRK", "ENCFF562LEK",
+  "ENCFF137KNW", "ENCFF103BLQ", "ENCFF663ILW",
+  #H3K79me2
+  "ENCFF892LYD", "ENCFF154HXA", "ENCFF931LYX",
+  #H3K27me3
+  "ENCFF819QSK", "ENCFF134YLO", "ENCFF336AWS")
+```
+
+```{r ENCODE_stats}
+ENCODE_stats <- function(matSignalSel,selPeaks,mergedPeaks_annotTXS,DMs) {
+allZero=which(apply(matSignalSel,1,sum)==0)
+if (length(allZero)>0) matSignalSel=matSignalSel[-allZero,]
+
+mtx=t(matSignalSel)
+d=dist(t(scale(mtx)))
+hc=hclust(d,method = "ward.D2")
+nClust=8
+clust=cutree(hc,k=nClust)
+
+selPeaks=intersect(selPeaks,row.names(matSignalSel))
+
+# Complete AnnotationRows
+AnnotationRows <- data.frame(
+  Localization=factor(mergedPeaks_annotTXS[selPeaks],levels = c("TSS","intraG","interG")),
+  CS=factor(clust[selPeaks]),
+  row.names = selPeaks
+)
+AnnotationRows=merge(AnnotationRows,DMs,by.x=0,by.y=1)
+row.names(AnnotationRows)=AnnotationRows$Row.names ; AnnotationRows=AnnotationRows[,-1]
+colnames(AnnotationRows)[3:4]=c("DM_kind","DM_sign")
+
+return(AnnotationRows)
+} 
+```
+
+```{r ENCODE_heatmap, fig.height=10, fig.width=8}
+ENCODE_heatmap <- function(matSignalSel,AnnotationRows,colsAnnot) {
+breaks=seq(-4,4,.1) ; cols=colorRampPalette(c("purple","black","yellow"))(length(breaks)+1)
+
+allZero=which(apply(matSignalSel,1,sum)==0)
+if (length(allZero)>0) matSignalSel=matSignalSel[-allZero,]
+
+pheatmap(matSignalSel, 
+         scale = "row", cluster_rows = T, cluster_cols = F,
+         clustering_method = "ward.D2",
+         show_rownames = F, show_colnames = F,
+         annotation_col = subset(AnnotationCols,row.names(AnnotationCols) %in% markOrder), 
+         annotation_row = AnnotationRows, 
+         annotation_colors = colsAnnot,
+         color = cols, border = NA,
+         #main = paste(mark,"DM peaks",length(DMpeaks)),
+         fontsize = 12)
+}
+```
+
+## All H3K4me2 peaks
+
+```{r ENCODE_counts, fig.height=10, fig.width=8, cache=TRUE}
+selPeaksAll=row.names(countNormLength_H3K4me2)
+selPeaks=selPeaksAll
+matSignalSel=na.omit(matSignal[selPeaks,markOrder])
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/AnnotationRows_allPeaks.RData")
+if (file.exists(fileOut)) {load(fileOut)} else {
+AnnotationRows=ENCODE_stats(matSignalSel,selPeaks,mergedPeaks_annotTXS,DMs)
+AnnotationRows_allPeaks=AnnotationRows
+save(AnnotationRows_allPeaks,file=fileOut)}
+
+# Tables
+AnnotationRows=AnnotationRows_allPeaks
+t=table(AnnotationRows$CS)
+kable(ceiling(t/sum(t)*100),col.names = c("CS","Freq"))
+
+t1=table(AnnotationRows$Localization)
+tt=table(AnnotationRows$CS,AnnotationRows$Localization)
+kable(ceiling(t1/sum(t1)*100),col.names = c("Localization","Freq"))
+kable(ceiling(tt/as.numeric(t)*100),row.names = T)
+# 
+# t1=table(AnnotationRows$DM_sign)
+# kable(ceiling(t1/sum(t1)*100),col.names = c("H3K4me2 Profile","Freq"))
+# tt=table(AnnotationRows$CS,AnnotationRows$DM_sign)
+# kable(ceiling(tt/as.numeric(t)*100),row.names = T)
+# 
+# t1=table(AnnotationRows$DM_kind)
+# kable(ceiling(t1/sum(t1)*100),col.names = c("H3K4me2 Dynamic","Freq"))
+# tt=table(AnnotationRows$CS,AnnotationRows$DM_kind)
+# kable(ceiling(tt/as.numeric(t)*100),row.names = T)
+
+ENCODE_heatmap(matSignalSel,AnnotationRows,colsAnnot)
+```
+
+# Multi factorial analysis
+
+Multi factorial analysis (MFA) (Escofier & Pages, Computational Statistics & Data Analysis, 1994) studies several groups of variables (numerical and/or categorical) defined on the same set of individuals. MFA is a factor analysis applied to the array including all groups of variables. Roughly, the behaviour of the method is equivalent to PCA (concerning quantitative variables) or to MCA (concerning qualitative variables).
+
+
+```{r MFA_functions}
+#http://juliejosse.com/wp-content/uploads/2019/03/MFA_staf2015.pdf
+
+individualContributionPlot <- function(mfa.res,nPeaks) {
+contribs=data.frame()
+for (i in 1:5) {
+rs=mfa.res[["ind"]][["contrib"]][order(mfa.res[["ind"]][["contrib"]][,i],decreasing = TRUE),i] %>% cumsum() 
+peakId=names(rs)
+contribs=rbind(contribs,data.frame(peakId,seq(1,length(rs),1),as.vector(rs),rep(i,length(rs))))}
+colnames(contribs)=c("peakId","Rank","RankedSum","Dimension")
+contribs$Dimension=factor(contribs$Dimension)
+
+p <- ggplot(contribs,aes(x=Rank,y=RankedSum)) + geom_point(aes(color=Dimension))
+
+print(p)
+
+return(contribs)
+}
+
+MFAstats <- function(mfa.res,xlim,ylim) {
+# Eigen values of separate analysis
+# Describe dimensionality of each variable group
+#print(kable(sapply(mfa.res[["separate.analyses"]],"[[", 1),
+#            caption = "Eigen values of separate analysis"))
+# Eigen values of global analysis
+# How many components are necessary to describe the total inertia of the dataset
+print(kable(mfa.res[["eig"]][1:10,],
+            caption = "Eigen values of global analysis"))
+# Correlation coeffs
+# How much of the stt defined by component C depends on groups(s) G : G -> C
+# Fundamental to see if there are structures COMMON to groups -> useful to integrate 
+print(kable(mfa.res[["group"]][["correlation"]],caption = "Group correlation with factor"))
+# Inertia or group contribution 
+# How much of the inertia of group G is explained by component C : C -> G
+print(kable(mfa.res[["group"]][["cos2"]],caption = "Group inertia per factor"))
+# Group contribution 
+plotGrouprepresentation(mfa.res)
+# Invidual contribution to each component
+# Determine wheter an axis is due to only some individuals 
+#mfa.res[["ind"]][["contrib"]]
+#individualContributions=individualContributionPlot(mfa.res,nPeaks)
+# Categorical variable representation (centroid of inds )
+# Determine how are 
+#dimdesc(mfa.res)$Dim.2
+#splotIndrepresentation(mfa.res,xlim,ylim)
+# Variable contribution 
+plotVarrepresentation(mfa.res)
+}
+
+plotGrouprepresentation <- function(mfa.res) {
+p1 <- plot.MFA(mfa.res,choix = "group",axes = c(1,2), title = "")
+p2 <- plot.MFA(mfa.res,choix = "group",axes = c(1,3), title = "")
+p3 <- plot.MFA(mfa.res,choix = "group",axes = c(1,4), title = "")
+p4 <- plot.MFA(mfa.res,choix = "group",axes = c(1,5), title = "")
+print((p1 | p2) / (p3 | p4))
+}
+
+plotIndrepresentation <- function(mfa.res,xlim,ylim) {
+  if (ylim!="" & xlim!="") {
+p1 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,2), 
+               title = "",ylim=ylim,xlim=xlim)
+p2 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,3),
+               title = "",ylim=ylim,xlim=xlim)
+p3 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,4),
+               title = "",ylim=ylim,xlim=xlim)
+p4 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,5),           
+               title = "",ylim=ylim,xlim=xlim) } else  
+{
+p1 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,2), title = "")
+p2 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,3), title = "")
+p3 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,4), title = "")
+p4 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,5), title = "")
+  
+}
+print((p1 | p2) / (p3 | p4))
+}
+
+plotVarrepresentation <- function(mfa.res) {
+  title=""
+  p1 <- plot.MFA(mfa.res,choix="var", axes=c(1,2),title=title,cex=.7)
+  p2 <- plot.MFA(mfa.res,choix="var", axes=c(1,3),title=title,cex=.7)
+  p3 <- plot.MFA(mfa.res,choix="var", axes=c(1,4),title=title,cex=.7)
+  p4 <- plot.MFA(mfa.res,choix="var", axes=c(1,5),title=title,cex=.7)
+  
+print((p1 | p2) / (p3 | p4) + plot_layout(guides = "collect"))
+
+}
+
+clusterStats <- function(hcpc.res) {
+### Cluster description
+## By variables: 
+# Test if variable (OMIC samples) values distribution per cluster is random or not 
+# For quantitative variables
+print(kable(hcpc.res[["desc.var"]]$quanti,caption = "Description by quantitative variables"))
+# For qualitative variables
+print(kable(hcpc.res[["desc.var"]][["category"]],caption = "Description by qualitative variables"))
+## By components 
+# Test if individuals coordinates distribution per cluster is random or not 
+# For quantitative variables
+print(kable(hcpc.res[["desc.axes"]][["quanti"]],
+            caption = "Description by factors (quantitative variables)"))
+# For qualitative variables
+print(kable(hcpc.res[["desc.var"]][["test.chi2"]],
+            caption = "Description by factors (qualitative variables)"))
+## By individuals (genomic regions)
+# Nearest individuals to the cluster's centroid
+print("Description by individuals (nearest):")
+print(hcpc.res[["desc.ind"]]$para)
+# Furthest individuals to the cluster's centroid
+print("Description by individuals (furthest):")
+print(hcpc.res[["desc.ind"]]$dist)
+}
+
+```
+
+```{r MFA_prepareMtx}
+# Differential analysis info
+AnnotationRows=AnnotationRows_allPeaks[,2:4]
+#DEGs2peaks=merge(DEGs,peaks2GeneTgene_H3K4me2[,c(3,1)],by=1)
+#AnnotationRows=merge(AnnotationRows,DEGs2peaks[,c(5,4,3,1)],by.x=0,by.y=1)
+DEGs2peaks=merge(DEGs,peaks2GeneTgene_H3K4me2,by.x=1,by.y=2)
+AnnotationRows=merge(AnnotationRows,DEGs2peaks[,c(5,3,4,1,2,6:8)],by.x=0,by.y=1)
+AnnotationRows$Distance=log(abs(AnnotationRows$Distance)+1,10)
+#AnnotationRows=AnnotationRows[!duplicated(AnnotationRows[,1]),]
+#AnnotationRows <- AnnotationRows[,-c(1,8)] ; 
+colnames(AnnotationRows)[5:6] <- c("DEG_kind","DEG_sign")
+AnnotationRows$DEG_kind=factor(AnnotationRows$DEG_kind,levels = c("common","only_H3vsUI","only_DIvsD7","notDE"))
+#AnnotationRows$DEG_sign=factor(AnnotationRows$DEG_sign,levels = c("Up","Down","UpDown","DownUp"))
+AnnotationRows$DEG_sign=factor(AnnotationRows$DEG_sign,levels = c("Up","Down"))
+#Re-order columns
+AnnotationRows=AnnotationRows[,c(1,7:11,2:6)]
+
+# Merge methylation, transcription and epigenome signal
+METHOME=merge(t(scale(t(countNormLength_H3K4me2))),
+              DM_results$D7vsUI[,1:2],by=0)
+colnames(METHOME)[8:9]=c("H3K4me2_log2baseMean","H3K4me2_D7vsUI_log2FC")
+METHOME$H3K4me2_log2baseMean=log(METHOME$H3K4me2_log2baseMean,2)
+
+# Merge TOME data
+Data=Data_RMA_NoBatchComBat_Annot
+t=na.omit(Data[!duplicated(Data$ENTREZID),])[,c(1,5,9,2,6,10,3,7,11,4,8,12)]
+TOME=log(t,2)
+row.names(TOME)=na.omit(Data[!duplicated(Data$ENTREZID),14])
+
+# Merge ENCODE data
+samples=AnnotationCols[markOrder,]
+names(samples)=markOrder
+ENCODEmean=t(scale(t(
+  data.frame(
+  H3K4me3=apply(matSignal[,names(samples[samples=="H3K4me3"])],1,function(x) mean((x))),
+  H3K4me2=apply(matSignal[,names(samples[samples=="H3K4me2"])],1,function(x) mean((x))),
+  H3K4me1=apply(matSignal[,names(samples[samples=="H3K4me1"])],1,function(x) mean((x))),
+  H3K27ac=apply(matSignal[,names(samples[samples=="H3K27ac"])],1,function(x) mean((x))),
+  H3K79me2=apply(matSignal[,names(samples[samples=="H3K79me2"])],1,function(x) mean((x))),
+  H3K27me3=apply(matSignal[,names(samples[samples=="H3K27me3"])],1,function(x) mean((x))),
+  row.names = row.names(matSignal))
+)))
+
+# EPITOME all 
+EPITOME=merge(AnnotationRows,METHOME,by=1)
+EPITOME=merge(EPITOME,ENCODEmean,by.x=1,by.y=0)
+EPITOME=merge(EPITOME,TOME,by.x=2,by.y=0)
+row.names(EPITOME)=paste0(EPITOME$Row.names,"_",EPITOME$ENTREZID)
+colnames(EPITOME)[2]="peakID"
+```
+
+```{r MFA_getMtx1}
+
+#### Get INPUT matrix
+
+# Select all peak2gene associations where there is a DMR or a DEG
+mfa.EPITOME=subset(EPITOME,EPITOME$DM_kind=="DM" | EPITOME$DEG_kind!="notDE")
+corrTh=.4
+mfa.EPITOME_corrTh=subset(EPITOME,(EPITOME$DM_kind=="DM" | EPITOME$DEG_kind!="notDE") &
+                    abs(EPITOME$Correlation)>corrTh)
+distTh=20e3
+mfa.EPITOME_distTh=subset(EPITOME,(EPITOME$DM_kind=="DM" | EPITOME$DEG_kind!="notDE") &
+                   EPITOME$Distance<log(distTh+1,10))
+# Save INPUT matix objects
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.EPITOME.RData")
+if (!file.exists(fileOut)) {save(mfa.EPITOME,file = fileOut)}
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.EPITOME_distTh.RData")
+if (!file.exists(fileOut)) {save(mfa.EPITOME_distTh,file = fileOut)}
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.EPITOME_corrTh.RData")
+if (!file.exists(fileOut)) {save(mfa.EPITOME_corrTh,file = fileOut)}
+
+```
+
+## Input matrix
+
+MFA is performed on the `r dim(mfa.EPITOME)[1]` peak-gene associations that include a DMR or a DEG. This is the distribution of quantitative and qualitative group variables.
+
+```{r MFA_getMtx2}
+# Check matrix behaviour
+boxplot(mfa.EPITOME[,c(26:37,18:19,4,5,20:25)],las=2,pch=19,cex=.5,
+        ylab="Scaled values by group",cex.axis=.8,
+        col=c(rep(palette()[8],12),rep(palette()[6],2),
+              rep(palette()[3],1),rep(palette()[4],1),rep(palette()[5],6)),
+        main="Distribution of quantitative variables for MFA")
+
+kable(table(mfa.EPITOME$DEG_kind,mfa.EPITOME$DEG_sign),caption = "DEGs for MFA")
+kable(table(mfa.EPITOME$DM_kind,mfa.EPITOME$DM_sign),caption = "DMRs for MFA")
+```
+
+## MFA components description
+
+```{r MFA, fig.height=10, fig.width=10, cache=TRUE}
+
+#### MFA
+colInds=c(26:37,18:19,4,5,20:25,7,10:11,8:9)
+
+# fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_distTh.RData")
+# if (file.exists(fileOut)) {load(fileOut)} else {
+#   mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_distTh  <- MFA(mfa.EPITOME_distTh[,colInds], ncp=5, 
+#     group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)),
+#     name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(6))
+# save(mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_distTh,file=fileOut)}
+# 
+# fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_corrTh.RData")
+# if (file.exists(fileOut)) {load(fileOut)} else {
+#   mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_corrTh  <- MFA(mfa.EPITOME_corrTh[,colInds], ncp=5, 
+#     group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)),
+#     name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(6))
+# save(mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_corrTh,file=fileOut)}
+
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS.RData")
+if (file.exists(fileOut)) {load(fileOut)} else {
+  mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS  <- MFA(mfa.EPITOME[,colInds], ncp=5, 
+    group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)),
+    name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(6))
+save(mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS,file=fileOut)}
+
+#### MFA stats
+mfa.res=mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS
+title="" ; xlim=c(-1,1) ; ylim=c(-1,1)
+MFAstats(mfa.res,"","")
+#print(plot.MFA(mfa.res,choix="axes", axes=c(1,2),title=title))
+plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(1,2), ylim=c(-1,2))
+p1 <- plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(1,2), xlim=xlim, ylim=ylim)
+p2 <- plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(3,2), xlim=xlim, ylim=ylim)
+p3 <- plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(3,4), xlim=xlim, ylim=ylim)
+p4 <- plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(5,4), xlim=xlim, ylim=ylim)
+print((p1 | p2) / (p3 | p4))
+```
+
+# Hierarchical clustering on MFA components
+
+Clustering is performed on the first 5 components obtained with the MFA.
+
+```{r HCPCclustering, fig.height=10, fig.width=10, cache=TRUE}
+
+#### Clustering
+nClust=12 ; maxClustSize=1400
+hcpc.res <- HCPC(mfa.res, nb.clust=nClust, conso=0, min=3, max=10,
+                 metric = "euclidean", method="ward",graph = FALSE,nb.par = maxClustSize)
+
+#clusterStats(hcpc.res)
+
+# Cluster description in term of qualitative variables
+lapply(hcpc.res[["desc.var"]][["category"]],function(x) {kable(subset(x,x[,"v.test"]>5.3))})
+
+# Cluster description
+tmp=hcpc.res[["data.clust"]][,23:28]
+table=merge(mfa.EPITOME[,1:6],tmp,by=0)
+colnames(table)[1]="peakIDgene_association"
+colnames(table)[13]="MFAclust"
+
+layout(matrix(1:2,nrow = 2))
+boxplot(table$Distance ~ table$MFAclust,pch=19,cex=.5,
+        xlab="",ylab="Distance peak - gene")
+boxplot(table$Correlation ~ table$MFAclust,pch=19,cex=.5,
+        xlab="MFA cluster",ylab="Correlation peak_hist-gene_exp")
+
+# Table of associations with clusters
+fileOut=paste0(WDIR,"/results_project12891/tables/integration/mfa_clust_TOMEMETHDISTCORREPIGDEGDMR_CS.txt")
+if (!file.exists(fileOut)) {
+# Add individual data
+for (i in 1:nClust) {
+  table=merge(table,as.data.frame(hcpc.res[["desc.ind"]][["para"]][[i]]),by.x=1,by.y=0,all.x=TRUE)
+  colnames(table)[dim(table)[2]]=paste0("distFromCentroid_MFAclust",i)
+}
+
+write.table(table,file = fileOut,sep="\t",quote=FALSE,row.names = FALSE, col.names = TRUE)
+}
+
+```
+
+## Statistics on clusters
+
+Number of total genes/peaks per cluster (Ngenes,Npeaks). Number of genes/peaks belonging only to a given cluster (NgeneOnlyClust, NpeakOnlyClust). Nearest cluster in terms of shared genes/peaks (nearestClustGene,nearestClustPeak) and corresponding number of shared genes/peaks (nearestClustNgene,nearestClustNpeak).
+
+```{r}
+geneFreq_allAss=table(table$ENTREZID)
+singleClustGenes=names(subset(geneFreq_allAss,geneFreq_allAss==1))
+peakFreq_allAss=table(table$peakID)
+singleClustPeaks=names(subset(peakFreq_allAss,peakFreq_allAss==1))
+clusts=split(table,table$MFAclust)
+geneClust=unique(table[,c(2,13)])
+peakClust=unique(table[,c(3,13)])
+
+clustStats=data.frame(Counts=c("Ngenes","NgeneOnlyClust","nearestClustGene","nearestClustNgene",
+                               "Npeaks","NpeaksOnlyClust","nearestClustPeak","nearestClustNpeak")) ; i=1
+for (clust in clusts) {
+  Ngenes=length(unique(clust$ENTREZID))
+  NgeneOnlyClust=length(setdiff(unique(clust$ENTREZID),
+                        subset(geneClust,geneClust$MFAclust!=i)$ENTREZID))
+  nearestClustGene=names(sort(table(geneClust[geneClust$ENTREZID %in% unique(clust$ENTREZID),]$MFAclust),decreasing = T)[2])
+  nearestClustNgene=as.numeric(sort(table(geneClust[geneClust$ENTREZID %in% unique(clust$ENTREZID),]$MFAclust),decreasing = T)[2])
+  Npeaks=length(unique(clust$peakID))
+  NpeaksOnlyClust=length(setdiff(unique(clust$peakID),
+                         subset(peakClust,peakClust$MFAclust!=i)$peakID))
+  nearestClustPeak=names(sort(table(peakClust[peakClust$peakID %in% unique(clust$peakID),]$MFAclust),decreasing = T)[2])
+  nearestClustNpeak=as.numeric(sort(table(peakClust[peakClust$peakID %in% unique(clust$peakID),]$MFAclust),decreasing = T)[2])
+  clustStats=cbind(clustStats,c(Ngenes,NgeneOnlyClust,nearestClustGene,nearestClustNgene,Npeaks,NpeaksOnlyClust,nearestClustPeak,nearestClustNpeak)) ; i=i+1
+}
+colnames(clustStats)=c("Counts",seq(1,nClust,1))
+kable(clustStats)
+```
+
+
+# Explore gene sets from functional analysis 
+
+Check representation of selected gene sets (GS) among genes of every MFA cluster. Values correspond to the ratio between the observed number of genes belonging to a given GS and the expected number per cluster. 
+
+```{r functions_genesetClusterRepresentation, cache=TRUE}
+getClustGSenrich <- function(table,selectedGSs,m_t2g_lists) {
+  tmp=unique(table[,c(2,13)])
+  t=table(tmp$MFAclust)
+  expected=t/sum(t)
+  overRep=data.frame(matrix(nrow = length(selectedGSs),ncol = nClust))
+  colnames(overRep)=seq(1,nClust,1) ; row.names(overRep)=selectedGSs
+  for (selectedGS in selectedGSs) {
+    ENTREZ_IDs=unique(c(m_t2g_lists[[selectedGS]]))
+    subTable=tmp[tmp$ENTREZID %in% ENTREZ_IDs,]
+  
+    # Clustering distribution
+    obs=table(subTable$MFAclust)/sum(table(subTable$MFAclust))
+  
+    overRep[selectedGS,] <- obs/expected
+  }
+    return(overRep)
+
+}
+
+m_t2g_df <- msigdbr(species = "Homo sapiens", category = "C2") %>% 
+  dplyr::select(gs_subcat, gs_name, entrez_gene)
+tmp=m_t2g_df[m_t2g_df$gs_subcat=="CP:REACTOME",]
+m_t2g_list_reactome=split(as.character(tmp$entrez_gene),gsub("REACTOME_","",tmp$gs_name))
+# tmp=m_t2g_df[m_t2g_df$gs_subcat=="CP:KEGG",]
+# m_t2g_list_kegg=split(as.character(tmp$entrez_gene),gsub("KEGG_","",tmp$gs_name))
+
+# m_t2g_lists=c(m_t2g_list_reactome,m_t2g_list_kegg)
+m_t2g_lists=m_t2g_list_reactome
+```
+
+## GS specific to 1st infection
+```{r genesetClusterRepresentation_only1st}
+  
+selectedGSs_only1st=c("NUCLEAR_RECEPTOR_TRANSCRIPTION_PATHWAY",
+              "CHEMOKINE_RECEPTORS_BIND_CHEMOKINES",
+              "CHROMATIN_MODIFYING_ENZYMES",
+              "PKMTS_METHYLATE_HISTONE_LYSINES",
+              "HATS_ACETYLATE_HISTONES",
+              "VESICLE_MEDIATED_TRANSPORT",
+              "MEMBRANE_TRAFFICKING",
+              "DNA_DOUBLE_STRAND_BREAK_RESPONSE",
+              "ANTIGEN_PROCESSING_UBIQUITINATION_PROTEASOME_DEGRADATION",
+              "PROCESSING_OF_DNA_DOUBLE_STRAND_BREAK_ENDS",
+              "DNA_DAMAGE_TELOMERE_STRESS_INDUCED_SENESCENCE")
+tab=getClustGSenrich(table,selectedGSs_only1st,m_t2g_lists)
+kable(tab,digits = rep(1,nClust))
+```              
+
+## GS specific to 2nd infection
+```{r genesetClusterRepresentation_only2nd}
+selectedGSs_only2nd=c("INFLUENZA_INFECTION",
+              "CELLULAR_RESPONSES_TO_EXTERNAL_STIMULI",
+              "INFECTIOUS_DISEASE","DISEASE",
+              "SIGNALING_BY_ROBO_RECEPTORS","SIGNALING_BY_NUCLEAR_RECEPTORS")
+ tab=getClustGSenrich(table,selectedGSs_only2nd,m_t2g_lists)
+kable(tab,digits = rep(1,nClust))
+```
+
+## GS common to both infections
+```{r genesetClusterRepresentation_common}
+selectedGSs_common=c("INTERLEUKIN_4_AND_INTERLEUKIN_13_SIGNALING",
+              "SIGNALING_BY_INTERLEUKINS","DNA_DOUBLE_STRAND_BREAK_REPAIR","DNA_REPAIR",
+              "FOXO_MEDIATED_TRANSCRIPTION",
+              "NEGATIVE_REGULATION_OF_MAPK_PATHWAY",
+              "CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM",
+              "INTERLEUKIN_10_SIGNALING",
+              "HDR_THROUGH_SINGLE_STRAND_ANNEALING_SSA_",
+              "FORMATION_OF_SENESCENCE_ASSOCIATED_HETEROCHROMATIN_FOCI_SAHF_")
+tab=getClustGSenrich(table,selectedGSs_common,m_t2g_lists)
+kable(tab,digits = rep(1,nClust))
+
+```
+
+```{bash searchSpecificGenes, include=FALSE, eval=FALSE}
+cd /Volumes/bioit/12891_Chevalier_EpiMemStrep/results_project12891/tables/
+# Count genes only 2nd in all clusters
+awk '$(NF-2)>.4 {print $2}' transcriptome/Sig_H3vsUI_D7vsUI_DIvsD7_DIvsH3_Annot.txt> id ; 
+grep -f id integration/mfa_clust_TOMEMETHDISTCORREPIGDEGDMR_CS.txt|awk '{print $4,$(NF-12)}' | 
+sort -u | awk '{print $2}'|sort |uniq -c  
+
+# Find genes only 2nd in cluster 7
+awk '$(NF-2)>.4 {print $2}' transcriptome/Sig_H3vsUI_D7vsUI_DIvsD7_DIvsH3_Annot.txt> id
+grep -f id integration/mfa_clust_TOMEMETHDISTCORREPIGDEGDMR_CS.txt|
+awk '$(NF-12)==7 {print $4}' | sort -u 
+
+rm id
+
+```
+
+
+# References
+
+```{r}
+sessionInfo()
+```
+
+
+```{r MFA_inputMatrixBenchmark, include=FALSE, eval=FALSE}
+ass=sample(row.names(mfa.EPITOME_distTh), 500, replace = FALSE, prob = NULL)
+xlim=c(-.6,.6) ; ylim=c(-.6,.6)
+
+title="TOME-METH_DIST-CORR-EPIG-CS-DEG-DMR"
+colInds=c(26:37,18:19,4,5,20:25,7,10:11,8:9)
+mfa.res_TOMEMETH_DISTCORREPIGCSDEGDMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, 
+  group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)),
+  name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(3,4,5,6,7,8))
+
+title="TOME-METH-DIST-CORR_EPIG-CS-DEG-DMR"
+mfa.res_TOMEMETHDISTCORR_EPIGCSDEGDMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, 
+  group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)),
+  name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),
+  num.group.sup=c(5,6,7,8))
+
+title="TOME-METH-DIST-CORR-EPIG_CS-DEG-DMR"
+mfa.res_TOMEMETHDISTCORREPIG_CSDEGDMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, 
+  group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)),
+  name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(6,7,8))
+
+title="TOME-METH-DIST-CORR-EPIG-CS_DEG-DMR"
+mfa.res_TOMEMETHDISTCORREPIGCS_DEGDMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, 
+  group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)),
+  name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(7,8))
+
+title="TOME-METH-DIST-CORR-EPIG-CS-DEG_DMR"
+mfa.res_TOMEMETHDISTCORREPIGCSDEG_DMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, 
+  group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)),
+  name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(8))
+
+title="TOME-METH-DIST-CORR-EPIG-CS-DEG-DMR"
+mfa.res_TOMEMETHDISTCORREPIGCSDEGDMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, 
+  group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)),
+  name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"))
+
+title="TOME-METH-DMR-DEG-DIST-CORR-EPIG_CS"
+mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_distTh  <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, 
+  group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)),
+  name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(6))
+
+results=list(TOMEMETH=mfa.res_TOMEMETH_DISTCORREPIGCSDEGDMR_distTh,
+     TOMEMETHDISTCORR=mfa.res_TOMEMETHDISTCORR_EPIGCSDEGDMR_distTh,
+     TOMEMETHDISTCORREPIG=mfa.res_TOMEMETHDISTCORREPIG_CSDEGDMR_distTh,
+     TOMEMETHDISTCORREPIGCS=mfa.res_TOMEMETHDISTCORREPIGCS_DEGDMR_distTh,
+     TOMEMETHDISTCORREPIGCSDEG=mfa.res_TOMEMETHDISTCORREPIGCSDEG_DMR_distTh,
+     TOMEMETHDISTCORREPIGCSDEGDMR=mfa.res_TOMEMETHDISTCORREPIGCSDEGDMR_distTh,
+     TOMEMETHDISTCORREPIGDEGDMR=mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_distTh
+     )
+
+xlim=c(-1,1) ; ylim=c(-1,1)
+i=1
+for (mfa.res in results) {
+  title=names(results)[i] ; print(title)
+  print(kable(mfa.res[["eig"]][1:10,],caption = "Eigen values of global analysis"))
+  print(plot.MFA(mfa.res,choix="axes", axes=c(1,2),title=title))
+  print(plot.MFA(mfa.res,choix="var", axes=c(1,2),title=title))
+  print(plot.MFA(mfa.res,choix="var", axes=c(3,2),title=title))
+  print(plot.MFA(mfa.res,choix="var", axes=c(3,4),title=title))
+  plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(1,2))
+  plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(1,2), xlim=xlim, ylim=ylim)
+  plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(3,2), xlim=xlim, ylim=ylim)
+  plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(3,4), xlim=xlim, ylim=ylim)
+  i=i+1
+}
+
+MFAstats(mfa.res,xlim,ylim)
+
+###################################
+###################################
+# OTHER INPUT matrices
+
+mfa.EPITOME_notDM_distTh=subset(EPITOME,(EPITOME$DM_kind!="DM" & EPITOME$DEG_kind!="notDE") &
+                   EPITOME$Distance<log(distTh+1,10))
+
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_distTh.RData")
+colInds=c(26:37,18:19,10:11,8:9)
+if (file.exists(fileOut)) {load(fileOut)} else {
+mfa.res_TOMEMETH_distTh2 <- MFA(mfa.EPITOME_distTh[,colInds], ncp=5, 
+    group=c(12,2,2,2), type=c(rep("c",2),rep("n",2)),
+    name.group=c("TOME","METH","DEG","DMR"),
+    num.group.sup=c(3,4))
+save(mfa.res_TOMEMETH_distTh2,file=fileOut)}
+
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_EPIG.RData")
+colInds=c(26:37,18:19,7,10:11,8:9)
+if (file.exists(fileOut)) {load(fileOut)} else {
+mfa.res_TOMEMETH_EPIG <- MFA(mfa.EPITOME[,colInds], ncp=5, 
+    group=c(12,2,1,2,2), type=c(rep("c",2),rep("n",3)),
+    name.group=c("TOME","METH","EPIG","DEG","DMR"),
+    num.group.sup=c(3,4,5))
+save(mfa.res_TOMEMETH_EPIG,file=fileOut)}
+
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_EPIG_distTh.RData")
+colInds=c(26:37,18:19,7,10:11,8:9)
+if (file.exists(fileOut)) {load(fileOut)} else {
+mfa.res_TOMEMETH_EPIG_distTh <- MFA(mfa.EPITOME_distTh[,colInds], ncp=5, 
+    group=c(12,2,1,2,2), type=c(rep("c",2),rep("n",3)),
+    name.group=c("TOME","METH","EPIG","DEG","DMR"),
+    num.group.sup=c(3,4,5))
+save(mfa.res_TOMEMETH_EPIG_distTh,file=fileOut)}
+
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_EPIG_notDM_distTh.RData")
+colInds=c(26:37,18:19,7,10:11,8:9)
+if (file.exists(fileOut)) {load(fileOut)} else {
+mfa.res_TOMEMETH_EPIG_notDM_distTh <- MFA(mfa.EPITOME_notDM_distTh[,colInds], ncp=5, 
+    group=c(12,2,1,2,2), type=c(rep("c",2),rep("n",3)),
+    name.group=c("TOME","METH","EPIG","DEG","DMR"),
+    num.group.sup=c(3,4,5))
+save(mfa.res_TOMEMETH_EPIG_notDM_distTh,file=fileOut)}
+
+
+```
+
+```{r MFAold, eval=FALSE, include=FALSE}
+
+selPeaks=unique(c(as.character(DMs[DMs$Kind=="DM",]$peakID),selPeaks_DEGs_nearest))
+selPeaks_nearest=unique(c(as.character(DMs[DMs$Kind=="DM",]$peakID),
+                  intersect(selPeaks_DEGs_nearest,selPeaksAllRep)))
+selPeaks_target=unique(c(as.character(DMs[DMs$Kind=="DM",]$peakID),
+                  intersect(selPeaks_DEGs_target,selPeaksAllRep)))
+nPeaks=length(selPeaks)
+
+selPeaks_nearest=unique(c(as.character(DMs[DMs$Kind=="DM",]$peakID),
+                  intersect(selPeaks_DEGs_nearest,selPeaksAllRep)))
+
+#### MFA 
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_EPIG_nearest.RData")
+if (file.exists(fileOut)) {load(fileOut)} else {
+mfa.res_TOMEMETH_EPIG_nearest <- MFA(na.omit(EPITOME[selPeaks_nearest,c(7:18,5,6,25:30)]), ncp=5, 
+    group=c(12,2,2,2,2), type=c(rep("c",2),rep("n",3)),
+    name.group=c("TOME","METH","EPIG","DM","DEG"),
+    num.group.sup=c(4,5))
+save(mfa.res_TOMEMETH_EPIG_nearest,file=fileOut)}
+
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_EPIG_target.RData")
+if (file.exists(fileOut)) {load(fileOut)} else {
+mfa.res_TOMEMETH_EPIG_target <- MFA(na.omit(EPITOME[selPeaks_target,c(7:18,5,6,25:30)]), ncp=5, 
+    group=c(12,2,2,2,2), type=c(rep("c",2),rep("n",3)),
+    name.group=c("TOME","METH","EPIG","DM","DEG"),
+    num.group.sup=c(4,5))
+save(mfa.res_TOMEMETH_EPIG_target,file=fileOut)}
+
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_EPIGMETH_nearest.RData")
+if (file.exists(fileOut)) {load(fileOut)} else {
+mfa.res_EPIGMETH_nearest <- MFA(na.omit(EPITOME[selPeaks_nearest,c(5,6,19:30)]), ncp=5, 
+    group=c(2,6,1,1,2,2), type=c(rep("c",2),rep("n",4)),
+    name.group=c("METH","EPIGENOME","LOC","CSkind","DM","DEG"),
+    num.group.sup=c(4))
+save(mfa.res_EPIGMETH_nearest,file=fileOut)}
+
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_EPIGMETH_target.RData")
+if (file.exists(fileOut)) {load(fileOut)} else {
+mfa.res_EPIGMETH_target <- MFA(na.omit(EPITOME[selPeaks_target,c(5,6,19:30)]), ncp=5, 
+    group=c(2,6,1,1,2,2), type=c(rep("c",2),rep("n",4)),
+    name.group=c("METH","EPIGENOME","LOC","CSkind","DM","DEG"),
+    num.group.sup=c(4))
+save(mfa.res_EPIGMETH_target,file=fileOut)}
+
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_EPIG_nearest.RData")
+if (file.exists(fileOut)) {load(fileOut)} else {
+mfa.res_EPIG_nearest <- MFA(na.omit(EPITOME[selPeaks_nearest,c(19:30)]), ncp=5, 
+    group=c(6,1,1,2,2), type=c(rep("c",1),rep("n",4)),
+    name.group=c("EPIGENOME","LOC","CSkind","METH","DEG"),
+    num.group.sup=c(3))
+save(mfa.res_EPIG_nearest,file=fileOut)}
+
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_EPIG_target.RData")
+if (file.exists(fileOut)) {load(fileOut)} else {
+mfa.res_EPIG_target <- MFA(na.omit(EPITOME[selPeaks_target,c(19:30)]), ncp=5, 
+    group=c(6,1,1,2,2), type=c(rep("c",1),rep("n",4)),
+    name.group=c("EPIGENOME","LOC","CSkind","METH","DEG"),
+    num.group.sup=c(3))
+save(mfa.res_EPIG_target,file=fileOut)}
+
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOME_nearest.RData")
+if (file.exists(fileOut)) {load(fileOut)} else {
+mfa.res_TOME_nearest <- MFA(na.omit(EPITOME[selPeaks_nearest,c(7:18,25:30)]), ncp=5, 
+    group=c(12,1,1,2,2), type=c(rep("c",1),rep("n",4)),
+    name.group=c("TOME","LOC","CSkind","DM","DEG"),
+    num.group.sup=c(5))
+save(mfa.res_TOME_nearest,file=fileOut)}
+
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOME_target.RData")
+if (file.exists(fileOut)) {load(fileOut)} else {
+mfa.res_TOME_target <- MFA(na.omit(EPITOME[selPeaks_target,c(7:18,25:30)]), ncp=5, 
+    group=c(12,1,1,2,2), type=c(rep("c",1),rep("n",4)),
+    name.group=c("TOME","LOC","CSkind","DM","DEG"),
+    num.group.sup=c(5))
+save(mfa.res_TOME_target,file=fileOut)}
+
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEonly.RData")
+if (file.exists(fileOut)) {load(fileOut)} else {
+tmp=na.omit(EPITOME[selPeaks_nearest,c(7:18,29:30)])
+tmp=subset(tmp,tmp$DEG_sign!="DownUp" & tmp$DEG_sign!="UpDown")
+mfa.res_TOMEonly <- MFA(tmp, ncp=5, 
+    group=c(12,2), type=c(rep("c",1),rep("n",1)),
+    name.group=c("TOME","DEG"))
+save(mfa.res_TOMEonly,file=fileOut)}
+
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_METHOME_nearest.RData")
+if (file.exists(fileOut)) {load(fileOut)} else {
+mfa.res_METHOME_nearest <- MFA(na.omit(EPITOME[selPeaks_nearest,c(1:4,25:30)]), ncp=5, 
+    group=c(4,1,1,2,2), type=c(rep("c",1),rep("n",4)),
+    name.group=c("METHOME","LOC","CSkind","DM","DEG"),
+    num.group.sup=c(4))
+save(mfa.res_METHOME_nearest,file=fileOut)}
+
+fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_METHOME_target.RData")
+if (file.exists(fileOut)) {load(fileOut)} else {
+mfa.res_METHOME_target <- MFA(na.omit(EPITOME[selPeaks_target,c(1:4,25:30)]), ncp=5, 
+    group=c(4,1,1,2,2), type=c(rep("c",1),rep("n",4)),
+    name.group=c("METHOME","LOC","CSkind","DM","DEG"),
+    num.group.sup=c(4))
+save(mfa.res_METHOME_target,file=fileOut)}
+
+
+mfa.res=mfa.res_TOMEMETH_EPIG_nearest
+mfa.res=mfa.res_EPIG_nearest
+mfa.res=mfa.res_EPIGMETH_nearest
+mfa.res=mfa.res_TOME_nearest
+mfa.res=mfa.res_TOMEonly ; nClust=8
+mfa.res=mfa.res_METHOME_nearest
+
+MFAstats(mfa.res)
+  
+
+#### Clustering
+
+hcpc.res <- HCPC(mfa.res, nb.clust=nClust, conso=0, min=3, max=10,
+                 metric = "euclidean")
+
+clusterStats(hcpc.res)
+
+# d=dimdesc(mfa.res)
+# 
+# #### Confidence ellipses around categories per variable
+# plotellipses(mfa.res,axes = c(2,3), means=FALSE)
+# plotellipses(mfa.res,axes = c(2,3),
+#              keepvar="DEG_kind",label = "none") ## for 1 variable
+
+```
+
+```{r}
+plot <- function(expected,obs,nClust) {
+  ggplot_df=rbind(as.data.frame(expected),as.data.frame(obs))
+  ggplot_df$Set=c(rep("All",nClust),rep("Subset",nClust))
+  colnames(ggplot_df)[1:2]=c("MFAcluster","Freq")
+  
+  p <- ggplot(data=ggplot_df, aes(x=Set, y=Freq, fill=MFAcluster)) +
+  geom_bar(stat="identity")+
+  scale_fill_brewer(palette="Paired")+
+  theme_minimal() + ggtitle(selectedGS)
+  print(p)
+
+}
+
+```
+
-- 
GitLab