From a2349e304a1e7168ff316928548d2c93a5d97499 Mon Sep 17 00:00:00 2001 From: Claudia CHICA <claudia.chica@pasteur.fr> Date: Fri, 20 May 2022 18:13:54 +0200 Subject: [PATCH] Update Wednesday/MAF/EPITOME_CS_integration.Rmd --- Wednesday/MAF/EPITOME_CS_integration.Rmd | 1107 ++++++++++++++++++++++ 1 file changed, 1107 insertions(+) create mode 100644 Wednesday/MAF/EPITOME_CS_integration.Rmd diff --git a/Wednesday/MAF/EPITOME_CS_integration.Rmd b/Wednesday/MAF/EPITOME_CS_integration.Rmd new file mode 100644 index 0000000..09d0d10 --- /dev/null +++ b/Wednesday/MAF/EPITOME_CS_integration.Rmd @@ -0,0 +1,1107 @@ +--- +title: "Epigenomic memory of infection: Data integration" +subtitle : "H3K4me2, transcriptome time courses and chromatin state" +author: "Claudia Chica" +date: "02.2021" +output: + html_document: + keep_md: yes + number_sections: yes + smart: no + toc: yes + toc_float: yes +editor_options: + chunk_output_type: console +--- + +```{r setup, message = FALSE, warning = FALSE, echo = FALSE} +knitr::opts_chunk$set(message=FALSE,warning=FALSE,echo=FALSE)#, cache=TRUE, cache.lazy = FALSE) +knitr::opts_knit$set(progress = FALSE, verbose = FALSE) +#knitr::opts_chunk$set(dev = "svg") + +WDIR="/Volumes/bioit/12891_Chevalier_EpiMemStrep" + +library(ggplot2) +library(plyranges) +library(RColorBrewer) +library(knitr) +library(pheatmap) +library(patchwork) +library(FactoMineR) +library(org.Hs.eg.db) +library(msigdbr) + + +``` + +# Methylation profile + +```{r loadEPIGresults} +# Load epigenomic counts +mark="H3K4me2" +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/countNormLength_",mark,".RData") +load(fileOut) + +# Discard ONE peak with negative normalised values in a SINGLE replicate (artefact from cycliclowess normalisation) +countNormLength_H3K4me2=countNormLength_H3K4me2[countNormLength_H3K4me2$norm.H3K4me2_UI_REP1>0,] + +# Get list of DM peaks +files=list.files(path = paste0(WDIR,"/results_project12891/tables/tables_v3_UCSC/",mark,"/"),pattern = "resAnDif_with_annotation_sorted.final.xls", full.names = T) +DM_results=list() ; i=1 +for (fileDM in files) { + no_col <- max(count.fields(fileDM, sep = "\t")) + tmp=read.table(fileDM,header = F,sep="\t", fill = TRUE,skip = 1, + col.names=1:no_col,colClasses = c("character",rep("numeric",16),rep("character",no_col-17))) + row.names(tmp) <- tmp[,1] + tmp=tmp[,14:17] + colnames(tmp) <- c("baseMean", "log2FoldChange", "pvalue", "padj") + tmp=tmp[row.names(countNormLength_H3K4me2),] + DM_results[[i]]=tmp ; i=i+1 +} +rm(tmp) +names(DM_results) <- c("D7vsH3","D7vsUI","H3vsUI") + +# DM profiles +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/DMpeaks_",mark,".RData") +load(fileOut) + +# Add not DEGs +DMpeaks_H3K4me2=intersect(DMpeaks_H3K4me2,row.names(DM_results$D7vsUI)) +tmp=setdiff(row.names(countNormLength_H3K4me2),DMpeaks_H3K4me2) +DMs=data.frame(peakID=DMpeaks_H3K4me2, + Kind="DM", +Sign=ifelse(DM_results$D7vsUI[DMpeaks_H3K4me2,]$log2FoldChange>=0,"Gain","Loss")) +notDMs=data.frame(peakID=tmp, + Kind="notDM", +Sign=ifelse(DM_results$D7vsUI[tmp,]$log2FoldChange>=0,"Gain","Loss")) +DMs=rbind(DMs,notDMs) +DMs$peakID=as.character(DMs$peakID) +rm(tmp,notDMs) + +kable(table(DMs$Kind,DMs$Sign)) +``` + +# Transcriptional profile +```{r loadTOMEresults} +# Expression data normalised and batch corrected +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/Data_RMAandNoBatch_Annot.RData") +load(fileOut) + +# DA results, same object used for the functional analysis +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/DE_results_Annot.RData") +load(fileOut) + +SYMBOL2ENTREZ=unique(data.frame(SYMBOL=as.character(DE_results$H3vsUI$SYMBOL), + ENTREZID=as.character(DE_results$H3vsUI$ENTREZID),stringsAsFactors = FALSE)) + +# DEGs padjTh=.05 ; lfcTh=0 +fileOut=paste0(WDIR,"/results_project12891/tables/transcriptome/DEGs_H3K4me2profiles_CompleteAnnotationPeaks_nearestGene.txt") +DEGs=read.table(fileOut,header = T)[,7:10] +DEGs=unique(na.omit(DEGs)) +DEGs$SYMBOL=as.character(DEGs$SYMBOL) +DEGs$ENTREZID=as.character(DEGs$ENTREZID) +# Add not DEGs +tmp=setdiff(SYMBOL2ENTREZ$ENTREZID,DEGs$ENTREZID) +H3vsUI=DE_results$H3vsUI[!duplicated(DE_results$H3vsUI$ENTREZID),] +DIvsD7=DE_results$DIvsD7[!duplicated(DE_results$DIvsD7$ENTREZID),] +notDEGs=data.frame(ENTREZID=SYMBOL2ENTREZ[SYMBOL2ENTREZ$ENTREZID %in% tmp,]$ENTREZID, + SYMBOL=SYMBOL2ENTREZ[SYMBOL2ENTREZ$ENTREZID %in% tmp,]$SYMBOL, + Kind="notDE", +# Sign=ifelse(H3vsUI[H3vsUI$ENTREZID %in% tmp,]$logFC>=0, +# ifelse(DIvsD7[DIvsD7$ENTREZID %in% tmp,]$logFC>=0,"Up","UpDown"), +# ifelse(DIvsD7[DIvsD7$ENTREZID %in% tmp,]$logFC<0,"Down","DownUp"))) +Sign=ifelse(H3vsUI[H3vsUI$ENTREZID %in% tmp,]$logFC>=0,"Up","Down")) +DEGs=rbind(DEGs,notDEGs) +DEGs$SYMBOL=as.character(DEGs$SYMBOL) +DEGs$ENTREZID=as.character(DEGs$ENTREZID) +rm(tmp,H3vsUI,DIvsD7,notDEGs) + +kable(table(DEGs$Kind,DEGs$Sign)) + +``` + +```{r load_peaks2Gene} +mark="H3K4me2" +### Nearest gene +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mergedPeaks_annotNearest_",mark,".RData") +load(fileOut) +peaks2GeneNearest_H3K4me2=data.frame( + peakId=mergedPeaks_annotNearest_H3K4me2@ranges@NAMES, + SYMBOL=as.character(mergedPeaks_annotNearest_H3K4me2@elementMetadata@listData[["mcols.GENESYMBOL"]]), + distance=mergedPeaks_annotNearest_H3K4me2@elementMetadata@listData[["distance"]], + stringsAsFactors=FALSE) +peaks2GeneNearest_H3K4me2=merge(peaks2GeneNearest_H3K4me2,SYMBOL2ENTREZ,by.x=2,by.y=1,all.x=TRUE)[,c(2,1,3,4)] + +### Target gene from MEME +# fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mergedPeaks_annotTgene_",mark,".RData") +# load(fileOut) +# peaks2GeneTgene_H3K4me2=data.frame(peakId=mergedPeaks_annotTgene_H3K4me2$peakId, +# SYMBOL=as.character(mergedPeaks_annotTgene_H3K4me2$Gene_Name), +# ENTREZID=mergedPeaks_annotTgene_H3K4me2$ENTREZID, +# distance=0,stringsAsFactors=FALSE) + +# Load ENTREZ ENSEMBL ids mapping to cross Tgene and EPI-TOME information +ENSEMBLEids <- org.Hs.egENSEMBL +mapped_genes <- mappedkeys(ENSEMBLEids) +ENTREZtoENSEMBL <- as.data.frame(ENSEMBLEids[mapped_genes]) +colnames(ENTREZtoENSEMBL)=c("ENTREZID","ENSEMBLEID") + +# Load Tgene MEME results +fileIn=paste0(WDIR,"/results_project12891/MEME_Tgene/H2K4me2/links.tsv") +mergedPeaks_Tgene=read.table(fileIn,header = T,sep = "\t") +## Get only significant associations peak-gene +pvalTh=.05 #; FDRth=.5 +mergedPeaks_TgeneSig=subset(mergedPeaks_Tgene,mergedPeaks_Tgene$CnD_P_Value < pvalTh) +mergedPeaks_TgeneSig$chr=gsub("chr","",gsub(":[0-9-]*","",mergedPeaks_TgeneSig$RE_Locus)) +mergedPeaks_TgeneSig$start=as.character(as.integer(gsub("chr[0-9XY]*:","",gsub("-[0-9]*","",mergedPeaks_TgeneSig$RE_Locus)))-1) +mergedPeaks_TgeneSig$end=gsub(".*-","",mergedPeaks_TgeneSig$RE_Locus) +mergedPeaks_TgeneSig$peakID=apply(mergedPeaks_TgeneSig[,18:20],1,function(x){paste0(x[1],"_",x[2],"_",x[3])}) +mergedPeaks_TgeneSig$ENSEMBL=gsub("\\.[0-9-]*","",mergedPeaks_TgeneSig$Gene_ID) +peaks2GeneTgene_H3K4me2=na.omit(merge(mergedPeaks_TgeneSig, + ENTREZtoENSEMBL,by.x=22,by.y=2,all.x=TRUE)[,c(22,23,10,14,17)]) + +# Select unique gene-region associations +# select association minimun Tgene pvalue +peaks2GeneTgene_H3K4me2$peak2gene=paste0(peaks2GeneTgene_H3K4me2$peakID,"_", + peaks2GeneTgene_H3K4me2$ENTREZID) +peaks2GeneTgene_H3K4me2$tmp=paste0(peaks2GeneTgene_H3K4me2$peakID,"_", + peaks2GeneTgene_H3K4me2$ENTREZID,"_",as.character(peaks2GeneTgene_H3K4me2$CnD_P_Value)) +peaks2GeneTgene_H3K4me2=peaks2GeneTgene_H3K4me2[!duplicated(peaks2GeneTgene_H3K4me2$tmp),] + +tmp=aggregate(peaks2GeneTgene_H3K4me2$CnD_P_Value,list(peaks2GeneTgene_H3K4me2$peak2gene),min) +tmp$tmp=paste0(tmp$Group.1,"_",as.character(tmp$x)) +peaks2GeneTgene_H3K4me2=peaks2GeneTgene_H3K4me2[peaks2GeneTgene_H3K4me2$tmp %in% tmp$tmp,-7] + +rm(mergedPeaks_TgeneSig,ENSEMBLEids,mapped_genes,ENTREZtoENSEMBL) +``` + +# Chromatin states + +```{r selectPeaks, eval=FALSE, include=FALSE} +# Select peaks using counts +logFCthUp_H3K4me2=.005 #quantile .75 +logFCthDown_H3K4me2=-.005 #quantile .25 +tmp=countNormLength_H3K4me2 +selPeaksAllRep=row.names( + subset(tmp,( + (log(tmp$norm.H3K4me2_D7_REP1/tmp$norm.H3K4me2_UI_REP1,2) > logFCthUp_H3K4me2 & + log(tmp$norm.H3K4me2_D7_REP2/tmp$norm.H3K4me2_UI_REP2,2) > logFCthUp_H3K4me2) | + (log(tmp$norm.H3K4me2_D7_REP1/tmp$norm.H3K4me2_UI_REP1,2) < logFCthDown_H3K4me2 & + log(tmp$norm.H3K4me2_D7_REP2/tmp$norm.H3K4me2_UI_REP2,2) < logFCthDown_H3K4me2)))) + +selPeaksAll=row.names(tmp) + +# Select peaks related to DEGs +selPeaks_DEGs_nearest=unique(merge(DEGs[DEGs$Kind!="notDE",],peaks2GeneNearest_H3K4me2[,c(4,1)],by=1)[,5]) +selPeaks_DEGs_target=unique(merge(DEGs[DEGs$Kind!="notDE",],peaks2GeneTgene_H3K4me2[,c(3,1)],by=1)[,5]) + +print(c("Number of:")) +print(c("Total peaks",dim(tmp)[1])) +print(c("Selected peaks - methylation logFold",length(selPeaksAll))) +print(c("Selected peaks - reproducible methylation logFold",length(selPeaksAllRep))) +print(c("Selected peaks - nearest to DEG",length(selPeaks_DEGs_nearest))) +print(c("Selected peaks - with DEG target",length(selPeaks_DEGs_target))) + +print(c("Methylation logFold nearest to DEG",length(intersect(selPeaksAll,selPeaks_DEGs_nearest)))) +print(c("Methylation logFold with DEG target",length(intersect(selPeaksAll,selPeaks_DEGs_target)))) +print(c("Reproducible methylation logFold nearest to DEG",length(intersect(selPeaksAllRep,selPeaks_DEGs_nearest)))) +print(c("Reproducible methylation logFold with DEG target",length(intersect(selPeaksAllRep,selPeaks_DEGs_target)))) + +ggplotdf=data.frame(log2FC=c( + DM_results$D7vsUI[selPeaksAll,]$log2FoldChange, + DM_results$D7vsUI[selPeaksAllRep,]$log2FoldChange, + DM_results$D7vsUI[selPeaks_DEGs_nearest,]$log2FoldChange, + DM_results$D7vsUI[selPeaks_DEGs_target,]$log2FoldChange), + peakPopulation=factor(c(rep("All peaks",length(selPeaksAll)), + rep("Reproducible Log2FC",length(selPeaksAllRep)), + rep("DEGs nearest",length(selPeaks_DEGs_nearest)), + rep("DEGs target",length(selPeaks_DEGs_target))), + levels = c("All peaks","Reproducible Log2FC","DEGs nearest","DEGs target"))) + +ggplot(ggplotdf, aes(x=log2FC,fill=peakPopulation)) + + geom_density(alpha=0.5) + facet_grid(~ peakPopulation) +``` + +```{r get_mergedPeaks_annotTXS} +# Load all merged peaks coords +fileIn=paste0(WDIR,"/results_project12891/tables/tables_v3_UCSC/",mark,"/",mark,"_mergedPeaks.bed") +mergedPeaks=read_bed(fileIn) + +# Annotate merged peaks +fileAnnot=paste0(WDIR,"/results_project12891/igv/igv_session_v3/annotation/hg19_transcripts.geneSymbol.reformat.bed") +txs_gr <- read_bed(fileAnnot) +tss_gr <- GRanges(seqnames = txs_gr@seqnames, + ranges = IRanges(start = txs_gr@ranges@start-1e3, end = txs_gr@ranges@start+1e3), + strand = txs_gr@strand, + mcols = data.frame(GENESYMBOL=mcols(txs_gr)$name)) +txs_gr <- GRanges(seqnames = txs_gr@seqnames, + ranges = txs_gr@ranges, + strand = txs_gr@strand, + mcols = data.frame(GENESYMBOL=mcols(txs_gr)$name)) + +mergedPeaks_annotTXS=array(NA,dim=length(mergedPeaks)) ; names(mergedPeaks_annotTXS)=mergedPeaks$name +ovtss=unique(findOverlaps(mergedPeaks,tss_gr)@from) +mergedPeaks_annotTXS[ovtss]="TSS" +ovintraG=setdiff(unique(findOverlaps(mergedPeaks,txs_gr)@from),ovtss) +mergedPeaks_annotTXS[ovintraG]="intraG" +mergedPeaks_annotTXS[is.na(mergedPeaks_annotTXS)]="interG" +``` + +```{r annotationQuantification} +DIR=paste0(WDIR,"/results_project12891/Encode_data/bigWig/") +fileMetadata=paste0(DIR,"ENCODE_metadata_bigWig_foldChangeOverControl.released.txt") +Encode_metadata <- read.table(file=fileMetadata,header=TRUE,sep="\t") + +marks=c("H3K4me3", "H3K4me2", "H3K27ac", "H3K4me1", "H3K79me2", "H3K27me3") +AnnotationCols <- data.frame( + Mark=factor(sub("-human","",Encode_metadata$Experiment.target),levels = marks), + row.names = Encode_metadata$File.accession) + +colsG=brewer.pal(n = 3, name = "Greys") +colsM=brewer.pal(n = 4, name = "Greens") +colsS=brewer.pal(n = 3, name = "Blues")[2:3] +nClust=8 ; colsC=brewer.pal(n = nClust, name = "Set2") +colsAnnot <- list(Mark=c("H3K4me3"=colsS[1], "H3K4me2"=colsS[2], "H3K27ac"=colsM[1], "H3K4me1"=colsM[2], "H3K79me2"=colsM[3], "H3K27me3"=colsM[4]), + DM_sign=c("Gain"="#FF0000", "Loss"="#FF000030"), + DM_kind=c("DM"="#000000", "notDM"="transparent"), + Localization=c("TSS"=colsG[1], "intraG"=colsG[2], "interG"=colsG[3]), + CS=c("1"=colsC[1],"2"=colsC[2],"3"=colsC[3],"4"=colsC[4],"5"=colsC[5],"6"=colsC[6],"7"=colsC[7],"8"=colsC[8])) +``` + +```{bash getMtx_ENCODElogchange, eval=FALSE, include=FALSE} +WDIR=/pasteur/projets/policy01/BioIT/12891_Chevalier_EpiMemStrep/results_project12891/ +module load UCSC-tools/v373 +module load bedtools/2.25.0 + +# 1. Download bigWig files +# 2. Get bedGraph files +cd Encode_data/bigWig/ +for file in *.bigWig ; do +if [ ! -f ${file/bigWig/bedGraph} ] ; then echo $file ; +srun bigWigToBedGraph $file ${file/bigWig/bedGraph} & fi ; done + +# 3. Overlap with mergedPeaks +mark=H3K4me2 +filePeaks=$WDIR/tables/tables_v3_UCSC/*/${mark}_mergedPeaks.bed +cd $WDIR/Encode_data/bigWig + +# 3.1. Small files +emacs overlapBedGraph_mergedPeaks.sh +#!/bin/sh +i=$(head -n ${SLURM_ARRAY_TASK_ID} $1 | tail -n 1) +WDIR=$2 +filePeaks=$3 +mark=$4 + +if [ ! -f $WDIR/${i/.bedGraph/_mergedPeaks_${mark}.ov} ] ; then +srun -o $i.ovb -e $i.e intersectBed -a $filePeaks -b $i -wb +awk '{s[$4]=s[$4]+$NF ; n[$4]=n[$4]+1} END {for (p in n) {print p"\t"s[p]/n[p]}}' $i.ovb | +sort -k1,1 > $WDIR/${i/.bedGraph/_mergedPeaks_${mark}.ov} ; fi + +ls *bedGraph > fileList.txt +n=`wc -l fileList.txt | awk '{print $1}'` +sbatch -p hubbioit --qos hubbioit -o ov.o -e ov.e --mem=200G --array=1-$n%$n overlapBedGraph_mergedPeaks.sh fileList.txt $WDIR/Encode_data/bigWig/ $filePeaks $mark + +# 4 Assemble matrix +echo "Peak_name" > header0 ; i=0 +for file in *$mark*ov ; do let j=$i+1 +echo ${file/_mergedPeaks_${mark}.ov} | paste header$i - > header$j ; ((i++)) ; done +paste *$mark*ov | awk '{line=$1 ; for (i=2;i<=NF;i=i+2) {line=line"\t"$i} ; print line}' | +cat header$j - > ENCODE_mergedPeaks_${mark}.ov + +# Clean +rm $WDIR/Encode_data/bigWig/*.ovb $WDIR/Encode_data/bigWig/*.bedGraph +rm $WDIR/Encode_data/bigWig/*.e $WDIR/Encode_data/bigWig/ov.o +rm header* +``` + +```{r ENCODE_bigWig} +DIR=paste0(WDIR,"/results_project12891/Encode_data/bigWig/") +fileSignal=paste0(DIR,"ENCODE_mergedPeaks_",mark,".ov") +matSignal <- read.table(file=fileSignal,header=TRUE,row.names = 1,sep="\t") + +markOrder=c(#H3K4me3 + "ENCFF628ANV", "ENCFF510LTC", "ENCFF556OVF", + #"ENCFF313GRK", "ENCFF709KWY", "ENCFF635VEW", + "ENCFF250OYQ", "ENCFF189JCB", "ENCFF421ZNH", + #H3K4me2 + "ENCFF876DTJ", "ENCFF975YWP", "ENCFF419LFZ", + #H3K4me1 + "ENCFF165ZPD", "ENCFF569ZJQ", "ENCFF613NHX", + #H3K27ac + #"ENCFF216GUZ", "ENCFF597SRK", "ENCFF562LEK", + "ENCFF137KNW", "ENCFF103BLQ", "ENCFF663ILW", + #H3K79me2 + "ENCFF892LYD", "ENCFF154HXA", "ENCFF931LYX", + #H3K27me3 + "ENCFF819QSK", "ENCFF134YLO", "ENCFF336AWS") +``` + +```{r ENCODE_stats} +ENCODE_stats <- function(matSignalSel,selPeaks,mergedPeaks_annotTXS,DMs) { +allZero=which(apply(matSignalSel,1,sum)==0) +if (length(allZero)>0) matSignalSel=matSignalSel[-allZero,] + +mtx=t(matSignalSel) +d=dist(t(scale(mtx))) +hc=hclust(d,method = "ward.D2") +nClust=8 +clust=cutree(hc,k=nClust) + +selPeaks=intersect(selPeaks,row.names(matSignalSel)) + +# Complete AnnotationRows +AnnotationRows <- data.frame( + Localization=factor(mergedPeaks_annotTXS[selPeaks],levels = c("TSS","intraG","interG")), + CS=factor(clust[selPeaks]), + row.names = selPeaks +) +AnnotationRows=merge(AnnotationRows,DMs,by.x=0,by.y=1) +row.names(AnnotationRows)=AnnotationRows$Row.names ; AnnotationRows=AnnotationRows[,-1] +colnames(AnnotationRows)[3:4]=c("DM_kind","DM_sign") + +return(AnnotationRows) +} +``` + +```{r ENCODE_heatmap, fig.height=10, fig.width=8} +ENCODE_heatmap <- function(matSignalSel,AnnotationRows,colsAnnot) { +breaks=seq(-4,4,.1) ; cols=colorRampPalette(c("purple","black","yellow"))(length(breaks)+1) + +allZero=which(apply(matSignalSel,1,sum)==0) +if (length(allZero)>0) matSignalSel=matSignalSel[-allZero,] + +pheatmap(matSignalSel, + scale = "row", cluster_rows = T, cluster_cols = F, + clustering_method = "ward.D2", + show_rownames = F, show_colnames = F, + annotation_col = subset(AnnotationCols,row.names(AnnotationCols) %in% markOrder), + annotation_row = AnnotationRows, + annotation_colors = colsAnnot, + color = cols, border = NA, + #main = paste(mark,"DM peaks",length(DMpeaks)), + fontsize = 12) +} +``` + +## All H3K4me2 peaks + +```{r ENCODE_counts, fig.height=10, fig.width=8, cache=TRUE} +selPeaksAll=row.names(countNormLength_H3K4me2) +selPeaks=selPeaksAll +matSignalSel=na.omit(matSignal[selPeaks,markOrder]) +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/AnnotationRows_allPeaks.RData") +if (file.exists(fileOut)) {load(fileOut)} else { +AnnotationRows=ENCODE_stats(matSignalSel,selPeaks,mergedPeaks_annotTXS,DMs) +AnnotationRows_allPeaks=AnnotationRows +save(AnnotationRows_allPeaks,file=fileOut)} + +# Tables +AnnotationRows=AnnotationRows_allPeaks +t=table(AnnotationRows$CS) +kable(ceiling(t/sum(t)*100),col.names = c("CS","Freq")) + +t1=table(AnnotationRows$Localization) +tt=table(AnnotationRows$CS,AnnotationRows$Localization) +kable(ceiling(t1/sum(t1)*100),col.names = c("Localization","Freq")) +kable(ceiling(tt/as.numeric(t)*100),row.names = T) +# +# t1=table(AnnotationRows$DM_sign) +# kable(ceiling(t1/sum(t1)*100),col.names = c("H3K4me2 Profile","Freq")) +# tt=table(AnnotationRows$CS,AnnotationRows$DM_sign) +# kable(ceiling(tt/as.numeric(t)*100),row.names = T) +# +# t1=table(AnnotationRows$DM_kind) +# kable(ceiling(t1/sum(t1)*100),col.names = c("H3K4me2 Dynamic","Freq")) +# tt=table(AnnotationRows$CS,AnnotationRows$DM_kind) +# kable(ceiling(tt/as.numeric(t)*100),row.names = T) + +ENCODE_heatmap(matSignalSel,AnnotationRows,colsAnnot) +``` + +# Multi factorial analysis + +Multi factorial analysis (MFA) (Escofier & Pages, Computational Statistics & Data Analysis, 1994) studies several groups of variables (numerical and/or categorical) defined on the same set of individuals. MFA is a factor analysis applied to the array including all groups of variables. Roughly, the behaviour of the method is equivalent to PCA (concerning quantitative variables) or to MCA (concerning qualitative variables). + + +```{r MFA_functions} +#http://juliejosse.com/wp-content/uploads/2019/03/MFA_staf2015.pdf + +individualContributionPlot <- function(mfa.res,nPeaks) { +contribs=data.frame() +for (i in 1:5) { +rs=mfa.res[["ind"]][["contrib"]][order(mfa.res[["ind"]][["contrib"]][,i],decreasing = TRUE),i] %>% cumsum() +peakId=names(rs) +contribs=rbind(contribs,data.frame(peakId,seq(1,length(rs),1),as.vector(rs),rep(i,length(rs))))} +colnames(contribs)=c("peakId","Rank","RankedSum","Dimension") +contribs$Dimension=factor(contribs$Dimension) + +p <- ggplot(contribs,aes(x=Rank,y=RankedSum)) + geom_point(aes(color=Dimension)) + +print(p) + +return(contribs) +} + +MFAstats <- function(mfa.res,xlim,ylim) { +# Eigen values of separate analysis +# Describe dimensionality of each variable group +#print(kable(sapply(mfa.res[["separate.analyses"]],"[[", 1), +# caption = "Eigen values of separate analysis")) +# Eigen values of global analysis +# How many components are necessary to describe the total inertia of the dataset +print(kable(mfa.res[["eig"]][1:10,], + caption = "Eigen values of global analysis")) +# Correlation coeffs +# How much of the stt defined by component C depends on groups(s) G : G -> C +# Fundamental to see if there are structures COMMON to groups -> useful to integrate +print(kable(mfa.res[["group"]][["correlation"]],caption = "Group correlation with factor")) +# Inertia or group contribution +# How much of the inertia of group G is explained by component C : C -> G +print(kable(mfa.res[["group"]][["cos2"]],caption = "Group inertia per factor")) +# Group contribution +plotGrouprepresentation(mfa.res) +# Invidual contribution to each component +# Determine wheter an axis is due to only some individuals +#mfa.res[["ind"]][["contrib"]] +#individualContributions=individualContributionPlot(mfa.res,nPeaks) +# Categorical variable representation (centroid of inds ) +# Determine how are +#dimdesc(mfa.res)$Dim.2 +#splotIndrepresentation(mfa.res,xlim,ylim) +# Variable contribution +plotVarrepresentation(mfa.res) +} + +plotGrouprepresentation <- function(mfa.res) { +p1 <- plot.MFA(mfa.res,choix = "group",axes = c(1,2), title = "") +p2 <- plot.MFA(mfa.res,choix = "group",axes = c(1,3), title = "") +p3 <- plot.MFA(mfa.res,choix = "group",axes = c(1,4), title = "") +p4 <- plot.MFA(mfa.res,choix = "group",axes = c(1,5), title = "") +print((p1 | p2) / (p3 | p4)) +} + +plotIndrepresentation <- function(mfa.res,xlim,ylim) { + if (ylim!="" & xlim!="") { +p1 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,2), + title = "",ylim=ylim,xlim=xlim) +p2 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,3), + title = "",ylim=ylim,xlim=xlim) +p3 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,4), + title = "",ylim=ylim,xlim=xlim) +p4 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,5), + title = "",ylim=ylim,xlim=xlim) } else +{ +p1 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,2), title = "") +p2 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,3), title = "") +p3 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,4), title = "") +p4 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,5), title = "") + +} +print((p1 | p2) / (p3 | p4)) +} + +plotVarrepresentation <- function(mfa.res) { + title="" + p1 <- plot.MFA(mfa.res,choix="var", axes=c(1,2),title=title,cex=.7) + p2 <- plot.MFA(mfa.res,choix="var", axes=c(1,3),title=title,cex=.7) + p3 <- plot.MFA(mfa.res,choix="var", axes=c(1,4),title=title,cex=.7) + p4 <- plot.MFA(mfa.res,choix="var", axes=c(1,5),title=title,cex=.7) + +print((p1 | p2) / (p3 | p4) + plot_layout(guides = "collect")) + +} + +clusterStats <- function(hcpc.res) { +### Cluster description +## By variables: +# Test if variable (OMIC samples) values distribution per cluster is random or not +# For quantitative variables +print(kable(hcpc.res[["desc.var"]]$quanti,caption = "Description by quantitative variables")) +# For qualitative variables +print(kable(hcpc.res[["desc.var"]][["category"]],caption = "Description by qualitative variables")) +## By components +# Test if individuals coordinates distribution per cluster is random or not +# For quantitative variables +print(kable(hcpc.res[["desc.axes"]][["quanti"]], + caption = "Description by factors (quantitative variables)")) +# For qualitative variables +print(kable(hcpc.res[["desc.var"]][["test.chi2"]], + caption = "Description by factors (qualitative variables)")) +## By individuals (genomic regions) +# Nearest individuals to the cluster's centroid +print("Description by individuals (nearest):") +print(hcpc.res[["desc.ind"]]$para) +# Furthest individuals to the cluster's centroid +print("Description by individuals (furthest):") +print(hcpc.res[["desc.ind"]]$dist) +} + +``` + +```{r MFA_prepareMtx} +# Differential analysis info +AnnotationRows=AnnotationRows_allPeaks[,2:4] +#DEGs2peaks=merge(DEGs,peaks2GeneTgene_H3K4me2[,c(3,1)],by=1) +#AnnotationRows=merge(AnnotationRows,DEGs2peaks[,c(5,4,3,1)],by.x=0,by.y=1) +DEGs2peaks=merge(DEGs,peaks2GeneTgene_H3K4me2,by.x=1,by.y=2) +AnnotationRows=merge(AnnotationRows,DEGs2peaks[,c(5,3,4,1,2,6:8)],by.x=0,by.y=1) +AnnotationRows$Distance=log(abs(AnnotationRows$Distance)+1,10) +#AnnotationRows=AnnotationRows[!duplicated(AnnotationRows[,1]),] +#AnnotationRows <- AnnotationRows[,-c(1,8)] ; +colnames(AnnotationRows)[5:6] <- c("DEG_kind","DEG_sign") +AnnotationRows$DEG_kind=factor(AnnotationRows$DEG_kind,levels = c("common","only_H3vsUI","only_DIvsD7","notDE")) +#AnnotationRows$DEG_sign=factor(AnnotationRows$DEG_sign,levels = c("Up","Down","UpDown","DownUp")) +AnnotationRows$DEG_sign=factor(AnnotationRows$DEG_sign,levels = c("Up","Down")) +#Re-order columns +AnnotationRows=AnnotationRows[,c(1,7:11,2:6)] + +# Merge methylation, transcription and epigenome signal +METHOME=merge(t(scale(t(countNormLength_H3K4me2))), + DM_results$D7vsUI[,1:2],by=0) +colnames(METHOME)[8:9]=c("H3K4me2_log2baseMean","H3K4me2_D7vsUI_log2FC") +METHOME$H3K4me2_log2baseMean=log(METHOME$H3K4me2_log2baseMean,2) + +# Merge TOME data +Data=Data_RMA_NoBatchComBat_Annot +t=na.omit(Data[!duplicated(Data$ENTREZID),])[,c(1,5,9,2,6,10,3,7,11,4,8,12)] +TOME=log(t,2) +row.names(TOME)=na.omit(Data[!duplicated(Data$ENTREZID),14]) + +# Merge ENCODE data +samples=AnnotationCols[markOrder,] +names(samples)=markOrder +ENCODEmean=t(scale(t( + data.frame( + H3K4me3=apply(matSignal[,names(samples[samples=="H3K4me3"])],1,function(x) mean((x))), + H3K4me2=apply(matSignal[,names(samples[samples=="H3K4me2"])],1,function(x) mean((x))), + H3K4me1=apply(matSignal[,names(samples[samples=="H3K4me1"])],1,function(x) mean((x))), + H3K27ac=apply(matSignal[,names(samples[samples=="H3K27ac"])],1,function(x) mean((x))), + H3K79me2=apply(matSignal[,names(samples[samples=="H3K79me2"])],1,function(x) mean((x))), + H3K27me3=apply(matSignal[,names(samples[samples=="H3K27me3"])],1,function(x) mean((x))), + row.names = row.names(matSignal)) +))) + +# EPITOME all +EPITOME=merge(AnnotationRows,METHOME,by=1) +EPITOME=merge(EPITOME,ENCODEmean,by.x=1,by.y=0) +EPITOME=merge(EPITOME,TOME,by.x=2,by.y=0) +row.names(EPITOME)=paste0(EPITOME$Row.names,"_",EPITOME$ENTREZID) +colnames(EPITOME)[2]="peakID" +``` + +```{r MFA_getMtx1} + +#### Get INPUT matrix + +# Select all peak2gene associations where there is a DMR or a DEG +mfa.EPITOME=subset(EPITOME,EPITOME$DM_kind=="DM" | EPITOME$DEG_kind!="notDE") +corrTh=.4 +mfa.EPITOME_corrTh=subset(EPITOME,(EPITOME$DM_kind=="DM" | EPITOME$DEG_kind!="notDE") & + abs(EPITOME$Correlation)>corrTh) +distTh=20e3 +mfa.EPITOME_distTh=subset(EPITOME,(EPITOME$DM_kind=="DM" | EPITOME$DEG_kind!="notDE") & + EPITOME$Distance<log(distTh+1,10)) +# Save INPUT matix objects +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.EPITOME.RData") +if (!file.exists(fileOut)) {save(mfa.EPITOME,file = fileOut)} +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.EPITOME_distTh.RData") +if (!file.exists(fileOut)) {save(mfa.EPITOME_distTh,file = fileOut)} +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.EPITOME_corrTh.RData") +if (!file.exists(fileOut)) {save(mfa.EPITOME_corrTh,file = fileOut)} + +``` + +## Input matrix + +MFA is performed on the `r dim(mfa.EPITOME)[1]` peak-gene associations that include a DMR or a DEG. This is the distribution of quantitative and qualitative group variables. + +```{r MFA_getMtx2} +# Check matrix behaviour +boxplot(mfa.EPITOME[,c(26:37,18:19,4,5,20:25)],las=2,pch=19,cex=.5, + ylab="Scaled values by group",cex.axis=.8, + col=c(rep(palette()[8],12),rep(palette()[6],2), + rep(palette()[3],1),rep(palette()[4],1),rep(palette()[5],6)), + main="Distribution of quantitative variables for MFA") + +kable(table(mfa.EPITOME$DEG_kind,mfa.EPITOME$DEG_sign),caption = "DEGs for MFA") +kable(table(mfa.EPITOME$DM_kind,mfa.EPITOME$DM_sign),caption = "DMRs for MFA") +``` + +## MFA components description + +```{r MFA, fig.height=10, fig.width=10, cache=TRUE} + +#### MFA +colInds=c(26:37,18:19,4,5,20:25,7,10:11,8:9) + +# fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_distTh.RData") +# if (file.exists(fileOut)) {load(fileOut)} else { +# mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_distTh <- MFA(mfa.EPITOME_distTh[,colInds], ncp=5, +# group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), +# name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(6)) +# save(mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_distTh,file=fileOut)} +# +# fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_corrTh.RData") +# if (file.exists(fileOut)) {load(fileOut)} else { +# mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_corrTh <- MFA(mfa.EPITOME_corrTh[,colInds], ncp=5, +# group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), +# name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(6)) +# save(mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_corrTh,file=fileOut)} + +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS.RData") +if (file.exists(fileOut)) {load(fileOut)} else { + mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS <- MFA(mfa.EPITOME[,colInds], ncp=5, + group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), + name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(6)) +save(mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS,file=fileOut)} + +#### MFA stats +mfa.res=mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS +title="" ; xlim=c(-1,1) ; ylim=c(-1,1) +MFAstats(mfa.res,"","") +#print(plot.MFA(mfa.res,choix="axes", axes=c(1,2),title=title)) +plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(1,2), ylim=c(-1,2)) +p1 <- plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(1,2), xlim=xlim, ylim=ylim) +p2 <- plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(3,2), xlim=xlim, ylim=ylim) +p3 <- plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(3,4), xlim=xlim, ylim=ylim) +p4 <- plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(5,4), xlim=xlim, ylim=ylim) +print((p1 | p2) / (p3 | p4)) +``` + +# Hierarchical clustering on MFA components + +Clustering is performed on the first 5 components obtained with the MFA. + +```{r HCPCclustering, fig.height=10, fig.width=10, cache=TRUE} + +#### Clustering +nClust=12 ; maxClustSize=1400 +hcpc.res <- HCPC(mfa.res, nb.clust=nClust, conso=0, min=3, max=10, + metric = "euclidean", method="ward",graph = FALSE,nb.par = maxClustSize) + +#clusterStats(hcpc.res) + +# Cluster description in term of qualitative variables +lapply(hcpc.res[["desc.var"]][["category"]],function(x) {kable(subset(x,x[,"v.test"]>5.3))}) + +# Cluster description +tmp=hcpc.res[["data.clust"]][,23:28] +table=merge(mfa.EPITOME[,1:6],tmp,by=0) +colnames(table)[1]="peakIDgene_association" +colnames(table)[13]="MFAclust" + +layout(matrix(1:2,nrow = 2)) +boxplot(table$Distance ~ table$MFAclust,pch=19,cex=.5, + xlab="",ylab="Distance peak - gene") +boxplot(table$Correlation ~ table$MFAclust,pch=19,cex=.5, + xlab="MFA cluster",ylab="Correlation peak_hist-gene_exp") + +# Table of associations with clusters +fileOut=paste0(WDIR,"/results_project12891/tables/integration/mfa_clust_TOMEMETHDISTCORREPIGDEGDMR_CS.txt") +if (!file.exists(fileOut)) { +# Add individual data +for (i in 1:nClust) { + table=merge(table,as.data.frame(hcpc.res[["desc.ind"]][["para"]][[i]]),by.x=1,by.y=0,all.x=TRUE) + colnames(table)[dim(table)[2]]=paste0("distFromCentroid_MFAclust",i) +} + +write.table(table,file = fileOut,sep="\t",quote=FALSE,row.names = FALSE, col.names = TRUE) +} + +``` + +## Statistics on clusters + +Number of total genes/peaks per cluster (Ngenes,Npeaks). Number of genes/peaks belonging only to a given cluster (NgeneOnlyClust, NpeakOnlyClust). Nearest cluster in terms of shared genes/peaks (nearestClustGene,nearestClustPeak) and corresponding number of shared genes/peaks (nearestClustNgene,nearestClustNpeak). + +```{r} +geneFreq_allAss=table(table$ENTREZID) +singleClustGenes=names(subset(geneFreq_allAss,geneFreq_allAss==1)) +peakFreq_allAss=table(table$peakID) +singleClustPeaks=names(subset(peakFreq_allAss,peakFreq_allAss==1)) +clusts=split(table,table$MFAclust) +geneClust=unique(table[,c(2,13)]) +peakClust=unique(table[,c(3,13)]) + +clustStats=data.frame(Counts=c("Ngenes","NgeneOnlyClust","nearestClustGene","nearestClustNgene", + "Npeaks","NpeaksOnlyClust","nearestClustPeak","nearestClustNpeak")) ; i=1 +for (clust in clusts) { + Ngenes=length(unique(clust$ENTREZID)) + NgeneOnlyClust=length(setdiff(unique(clust$ENTREZID), + subset(geneClust,geneClust$MFAclust!=i)$ENTREZID)) + nearestClustGene=names(sort(table(geneClust[geneClust$ENTREZID %in% unique(clust$ENTREZID),]$MFAclust),decreasing = T)[2]) + nearestClustNgene=as.numeric(sort(table(geneClust[geneClust$ENTREZID %in% unique(clust$ENTREZID),]$MFAclust),decreasing = T)[2]) + Npeaks=length(unique(clust$peakID)) + NpeaksOnlyClust=length(setdiff(unique(clust$peakID), + subset(peakClust,peakClust$MFAclust!=i)$peakID)) + nearestClustPeak=names(sort(table(peakClust[peakClust$peakID %in% unique(clust$peakID),]$MFAclust),decreasing = T)[2]) + nearestClustNpeak=as.numeric(sort(table(peakClust[peakClust$peakID %in% unique(clust$peakID),]$MFAclust),decreasing = T)[2]) + clustStats=cbind(clustStats,c(Ngenes,NgeneOnlyClust,nearestClustGene,nearestClustNgene,Npeaks,NpeaksOnlyClust,nearestClustPeak,nearestClustNpeak)) ; i=i+1 +} +colnames(clustStats)=c("Counts",seq(1,nClust,1)) +kable(clustStats) +``` + + +# Explore gene sets from functional analysis + +Check representation of selected gene sets (GS) among genes of every MFA cluster. Values correspond to the ratio between the observed number of genes belonging to a given GS and the expected number per cluster. + +```{r functions_genesetClusterRepresentation, cache=TRUE} +getClustGSenrich <- function(table,selectedGSs,m_t2g_lists) { + tmp=unique(table[,c(2,13)]) + t=table(tmp$MFAclust) + expected=t/sum(t) + overRep=data.frame(matrix(nrow = length(selectedGSs),ncol = nClust)) + colnames(overRep)=seq(1,nClust,1) ; row.names(overRep)=selectedGSs + for (selectedGS in selectedGSs) { + ENTREZ_IDs=unique(c(m_t2g_lists[[selectedGS]])) + subTable=tmp[tmp$ENTREZID %in% ENTREZ_IDs,] + + # Clustering distribution + obs=table(subTable$MFAclust)/sum(table(subTable$MFAclust)) + + overRep[selectedGS,] <- obs/expected + } + return(overRep) + +} + +m_t2g_df <- msigdbr(species = "Homo sapiens", category = "C2") %>% + dplyr::select(gs_subcat, gs_name, entrez_gene) +tmp=m_t2g_df[m_t2g_df$gs_subcat=="CP:REACTOME",] +m_t2g_list_reactome=split(as.character(tmp$entrez_gene),gsub("REACTOME_","",tmp$gs_name)) +# tmp=m_t2g_df[m_t2g_df$gs_subcat=="CP:KEGG",] +# m_t2g_list_kegg=split(as.character(tmp$entrez_gene),gsub("KEGG_","",tmp$gs_name)) + +# m_t2g_lists=c(m_t2g_list_reactome,m_t2g_list_kegg) +m_t2g_lists=m_t2g_list_reactome +``` + +## GS specific to 1st infection +```{r genesetClusterRepresentation_only1st} + +selectedGSs_only1st=c("NUCLEAR_RECEPTOR_TRANSCRIPTION_PATHWAY", + "CHEMOKINE_RECEPTORS_BIND_CHEMOKINES", + "CHROMATIN_MODIFYING_ENZYMES", + "PKMTS_METHYLATE_HISTONE_LYSINES", + "HATS_ACETYLATE_HISTONES", + "VESICLE_MEDIATED_TRANSPORT", + "MEMBRANE_TRAFFICKING", + "DNA_DOUBLE_STRAND_BREAK_RESPONSE", + "ANTIGEN_PROCESSING_UBIQUITINATION_PROTEASOME_DEGRADATION", + "PROCESSING_OF_DNA_DOUBLE_STRAND_BREAK_ENDS", + "DNA_DAMAGE_TELOMERE_STRESS_INDUCED_SENESCENCE") +tab=getClustGSenrich(table,selectedGSs_only1st,m_t2g_lists) +kable(tab,digits = rep(1,nClust)) +``` + +## GS specific to 2nd infection +```{r genesetClusterRepresentation_only2nd} +selectedGSs_only2nd=c("INFLUENZA_INFECTION", + "CELLULAR_RESPONSES_TO_EXTERNAL_STIMULI", + "INFECTIOUS_DISEASE","DISEASE", + "SIGNALING_BY_ROBO_RECEPTORS","SIGNALING_BY_NUCLEAR_RECEPTORS") + tab=getClustGSenrich(table,selectedGSs_only2nd,m_t2g_lists) +kable(tab,digits = rep(1,nClust)) +``` + +## GS common to both infections +```{r genesetClusterRepresentation_common} +selectedGSs_common=c("INTERLEUKIN_4_AND_INTERLEUKIN_13_SIGNALING", + "SIGNALING_BY_INTERLEUKINS","DNA_DOUBLE_STRAND_BREAK_REPAIR","DNA_REPAIR", + "FOXO_MEDIATED_TRANSCRIPTION", + "NEGATIVE_REGULATION_OF_MAPK_PATHWAY", + "CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM", + "INTERLEUKIN_10_SIGNALING", + "HDR_THROUGH_SINGLE_STRAND_ANNEALING_SSA_", + "FORMATION_OF_SENESCENCE_ASSOCIATED_HETEROCHROMATIN_FOCI_SAHF_") +tab=getClustGSenrich(table,selectedGSs_common,m_t2g_lists) +kable(tab,digits = rep(1,nClust)) + +``` + +```{bash searchSpecificGenes, include=FALSE, eval=FALSE} +cd /Volumes/bioit/12891_Chevalier_EpiMemStrep/results_project12891/tables/ +# Count genes only 2nd in all clusters +awk '$(NF-2)>.4 {print $2}' transcriptome/Sig_H3vsUI_D7vsUI_DIvsD7_DIvsH3_Annot.txt> id ; +grep -f id integration/mfa_clust_TOMEMETHDISTCORREPIGDEGDMR_CS.txt|awk '{print $4,$(NF-12)}' | +sort -u | awk '{print $2}'|sort |uniq -c + +# Find genes only 2nd in cluster 7 +awk '$(NF-2)>.4 {print $2}' transcriptome/Sig_H3vsUI_D7vsUI_DIvsD7_DIvsH3_Annot.txt> id +grep -f id integration/mfa_clust_TOMEMETHDISTCORREPIGDEGDMR_CS.txt| +awk '$(NF-12)==7 {print $4}' | sort -u + +rm id + +``` + + +# References + +```{r} +sessionInfo() +``` + + +```{r MFA_inputMatrixBenchmark, include=FALSE, eval=FALSE} +ass=sample(row.names(mfa.EPITOME_distTh), 500, replace = FALSE, prob = NULL) +xlim=c(-.6,.6) ; ylim=c(-.6,.6) + +title="TOME-METH_DIST-CORR-EPIG-CS-DEG-DMR" +colInds=c(26:37,18:19,4,5,20:25,7,10:11,8:9) +mfa.res_TOMEMETH_DISTCORREPIGCSDEGDMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, + group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), + name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(3,4,5,6,7,8)) + +title="TOME-METH-DIST-CORR_EPIG-CS-DEG-DMR" +mfa.res_TOMEMETHDISTCORR_EPIGCSDEGDMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, + group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), + name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"), + num.group.sup=c(5,6,7,8)) + +title="TOME-METH-DIST-CORR-EPIG_CS-DEG-DMR" +mfa.res_TOMEMETHDISTCORREPIG_CSDEGDMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, + group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), + name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(6,7,8)) + +title="TOME-METH-DIST-CORR-EPIG-CS_DEG-DMR" +mfa.res_TOMEMETHDISTCORREPIGCS_DEGDMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, + group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), + name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(7,8)) + +title="TOME-METH-DIST-CORR-EPIG-CS-DEG_DMR" +mfa.res_TOMEMETHDISTCORREPIGCSDEG_DMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, + group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), + name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(8)) + +title="TOME-METH-DIST-CORR-EPIG-CS-DEG-DMR" +mfa.res_TOMEMETHDISTCORREPIGCSDEGDMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, + group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), + name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR")) + +title="TOME-METH-DMR-DEG-DIST-CORR-EPIG_CS" +mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, + group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), + name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(6)) + +results=list(TOMEMETH=mfa.res_TOMEMETH_DISTCORREPIGCSDEGDMR_distTh, + TOMEMETHDISTCORR=mfa.res_TOMEMETHDISTCORR_EPIGCSDEGDMR_distTh, + TOMEMETHDISTCORREPIG=mfa.res_TOMEMETHDISTCORREPIG_CSDEGDMR_distTh, + TOMEMETHDISTCORREPIGCS=mfa.res_TOMEMETHDISTCORREPIGCS_DEGDMR_distTh, + TOMEMETHDISTCORREPIGCSDEG=mfa.res_TOMEMETHDISTCORREPIGCSDEG_DMR_distTh, + TOMEMETHDISTCORREPIGCSDEGDMR=mfa.res_TOMEMETHDISTCORREPIGCSDEGDMR_distTh, + TOMEMETHDISTCORREPIGDEGDMR=mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_distTh + ) + +xlim=c(-1,1) ; ylim=c(-1,1) +i=1 +for (mfa.res in results) { + title=names(results)[i] ; print(title) + print(kable(mfa.res[["eig"]][1:10,],caption = "Eigen values of global analysis")) + print(plot.MFA(mfa.res,choix="axes", axes=c(1,2),title=title)) + print(plot.MFA(mfa.res,choix="var", axes=c(1,2),title=title)) + print(plot.MFA(mfa.res,choix="var", axes=c(3,2),title=title)) + print(plot.MFA(mfa.res,choix="var", axes=c(3,4),title=title)) + plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(1,2)) + plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(1,2), xlim=xlim, ylim=ylim) + plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(3,2), xlim=xlim, ylim=ylim) + plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(3,4), xlim=xlim, ylim=ylim) + i=i+1 +} + +MFAstats(mfa.res,xlim,ylim) + +################################### +################################### +# OTHER INPUT matrices + +mfa.EPITOME_notDM_distTh=subset(EPITOME,(EPITOME$DM_kind!="DM" & EPITOME$DEG_kind!="notDE") & + EPITOME$Distance<log(distTh+1,10)) + +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_distTh.RData") +colInds=c(26:37,18:19,10:11,8:9) +if (file.exists(fileOut)) {load(fileOut)} else { +mfa.res_TOMEMETH_distTh2 <- MFA(mfa.EPITOME_distTh[,colInds], ncp=5, + group=c(12,2,2,2), type=c(rep("c",2),rep("n",2)), + name.group=c("TOME","METH","DEG","DMR"), + num.group.sup=c(3,4)) +save(mfa.res_TOMEMETH_distTh2,file=fileOut)} + +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_EPIG.RData") +colInds=c(26:37,18:19,7,10:11,8:9) +if (file.exists(fileOut)) {load(fileOut)} else { +mfa.res_TOMEMETH_EPIG <- MFA(mfa.EPITOME[,colInds], ncp=5, + group=c(12,2,1,2,2), type=c(rep("c",2),rep("n",3)), + name.group=c("TOME","METH","EPIG","DEG","DMR"), + num.group.sup=c(3,4,5)) +save(mfa.res_TOMEMETH_EPIG,file=fileOut)} + +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_EPIG_distTh.RData") +colInds=c(26:37,18:19,7,10:11,8:9) +if (file.exists(fileOut)) {load(fileOut)} else { +mfa.res_TOMEMETH_EPIG_distTh <- MFA(mfa.EPITOME_distTh[,colInds], ncp=5, + group=c(12,2,1,2,2), type=c(rep("c",2),rep("n",3)), + name.group=c("TOME","METH","EPIG","DEG","DMR"), + num.group.sup=c(3,4,5)) +save(mfa.res_TOMEMETH_EPIG_distTh,file=fileOut)} + +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_EPIG_notDM_distTh.RData") +colInds=c(26:37,18:19,7,10:11,8:9) +if (file.exists(fileOut)) {load(fileOut)} else { +mfa.res_TOMEMETH_EPIG_notDM_distTh <- MFA(mfa.EPITOME_notDM_distTh[,colInds], ncp=5, + group=c(12,2,1,2,2), type=c(rep("c",2),rep("n",3)), + name.group=c("TOME","METH","EPIG","DEG","DMR"), + num.group.sup=c(3,4,5)) +save(mfa.res_TOMEMETH_EPIG_notDM_distTh,file=fileOut)} + + +``` + +```{r MFAold, eval=FALSE, include=FALSE} + +selPeaks=unique(c(as.character(DMs[DMs$Kind=="DM",]$peakID),selPeaks_DEGs_nearest)) +selPeaks_nearest=unique(c(as.character(DMs[DMs$Kind=="DM",]$peakID), + intersect(selPeaks_DEGs_nearest,selPeaksAllRep))) +selPeaks_target=unique(c(as.character(DMs[DMs$Kind=="DM",]$peakID), + intersect(selPeaks_DEGs_target,selPeaksAllRep))) +nPeaks=length(selPeaks) + +selPeaks_nearest=unique(c(as.character(DMs[DMs$Kind=="DM",]$peakID), + intersect(selPeaks_DEGs_nearest,selPeaksAllRep))) + +#### MFA +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_EPIG_nearest.RData") +if (file.exists(fileOut)) {load(fileOut)} else { +mfa.res_TOMEMETH_EPIG_nearest <- MFA(na.omit(EPITOME[selPeaks_nearest,c(7:18,5,6,25:30)]), ncp=5, + group=c(12,2,2,2,2), type=c(rep("c",2),rep("n",3)), + name.group=c("TOME","METH","EPIG","DM","DEG"), + num.group.sup=c(4,5)) +save(mfa.res_TOMEMETH_EPIG_nearest,file=fileOut)} + +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_EPIG_target.RData") +if (file.exists(fileOut)) {load(fileOut)} else { +mfa.res_TOMEMETH_EPIG_target <- MFA(na.omit(EPITOME[selPeaks_target,c(7:18,5,6,25:30)]), ncp=5, + group=c(12,2,2,2,2), type=c(rep("c",2),rep("n",3)), + name.group=c("TOME","METH","EPIG","DM","DEG"), + num.group.sup=c(4,5)) +save(mfa.res_TOMEMETH_EPIG_target,file=fileOut)} + +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_EPIGMETH_nearest.RData") +if (file.exists(fileOut)) {load(fileOut)} else { +mfa.res_EPIGMETH_nearest <- MFA(na.omit(EPITOME[selPeaks_nearest,c(5,6,19:30)]), ncp=5, + group=c(2,6,1,1,2,2), type=c(rep("c",2),rep("n",4)), + name.group=c("METH","EPIGENOME","LOC","CSkind","DM","DEG"), + num.group.sup=c(4)) +save(mfa.res_EPIGMETH_nearest,file=fileOut)} + +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_EPIGMETH_target.RData") +if (file.exists(fileOut)) {load(fileOut)} else { +mfa.res_EPIGMETH_target <- MFA(na.omit(EPITOME[selPeaks_target,c(5,6,19:30)]), ncp=5, + group=c(2,6,1,1,2,2), type=c(rep("c",2),rep("n",4)), + name.group=c("METH","EPIGENOME","LOC","CSkind","DM","DEG"), + num.group.sup=c(4)) +save(mfa.res_EPIGMETH_target,file=fileOut)} + +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_EPIG_nearest.RData") +if (file.exists(fileOut)) {load(fileOut)} else { +mfa.res_EPIG_nearest <- MFA(na.omit(EPITOME[selPeaks_nearest,c(19:30)]), ncp=5, + group=c(6,1,1,2,2), type=c(rep("c",1),rep("n",4)), + name.group=c("EPIGENOME","LOC","CSkind","METH","DEG"), + num.group.sup=c(3)) +save(mfa.res_EPIG_nearest,file=fileOut)} + +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_EPIG_target.RData") +if (file.exists(fileOut)) {load(fileOut)} else { +mfa.res_EPIG_target <- MFA(na.omit(EPITOME[selPeaks_target,c(19:30)]), ncp=5, + group=c(6,1,1,2,2), type=c(rep("c",1),rep("n",4)), + name.group=c("EPIGENOME","LOC","CSkind","METH","DEG"), + num.group.sup=c(3)) +save(mfa.res_EPIG_target,file=fileOut)} + +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOME_nearest.RData") +if (file.exists(fileOut)) {load(fileOut)} else { +mfa.res_TOME_nearest <- MFA(na.omit(EPITOME[selPeaks_nearest,c(7:18,25:30)]), ncp=5, + group=c(12,1,1,2,2), type=c(rep("c",1),rep("n",4)), + name.group=c("TOME","LOC","CSkind","DM","DEG"), + num.group.sup=c(5)) +save(mfa.res_TOME_nearest,file=fileOut)} + +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOME_target.RData") +if (file.exists(fileOut)) {load(fileOut)} else { +mfa.res_TOME_target <- MFA(na.omit(EPITOME[selPeaks_target,c(7:18,25:30)]), ncp=5, + group=c(12,1,1,2,2), type=c(rep("c",1),rep("n",4)), + name.group=c("TOME","LOC","CSkind","DM","DEG"), + num.group.sup=c(5)) +save(mfa.res_TOME_target,file=fileOut)} + +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEonly.RData") +if (file.exists(fileOut)) {load(fileOut)} else { +tmp=na.omit(EPITOME[selPeaks_nearest,c(7:18,29:30)]) +tmp=subset(tmp,tmp$DEG_sign!="DownUp" & tmp$DEG_sign!="UpDown") +mfa.res_TOMEonly <- MFA(tmp, ncp=5, + group=c(12,2), type=c(rep("c",1),rep("n",1)), + name.group=c("TOME","DEG")) +save(mfa.res_TOMEonly,file=fileOut)} + +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_METHOME_nearest.RData") +if (file.exists(fileOut)) {load(fileOut)} else { +mfa.res_METHOME_nearest <- MFA(na.omit(EPITOME[selPeaks_nearest,c(1:4,25:30)]), ncp=5, + group=c(4,1,1,2,2), type=c(rep("c",1),rep("n",4)), + name.group=c("METHOME","LOC","CSkind","DM","DEG"), + num.group.sup=c(4)) +save(mfa.res_METHOME_nearest,file=fileOut)} + +fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_METHOME_target.RData") +if (file.exists(fileOut)) {load(fileOut)} else { +mfa.res_METHOME_target <- MFA(na.omit(EPITOME[selPeaks_target,c(1:4,25:30)]), ncp=5, + group=c(4,1,1,2,2), type=c(rep("c",1),rep("n",4)), + name.group=c("METHOME","LOC","CSkind","DM","DEG"), + num.group.sup=c(4)) +save(mfa.res_METHOME_target,file=fileOut)} + + +mfa.res=mfa.res_TOMEMETH_EPIG_nearest +mfa.res=mfa.res_EPIG_nearest +mfa.res=mfa.res_EPIGMETH_nearest +mfa.res=mfa.res_TOME_nearest +mfa.res=mfa.res_TOMEonly ; nClust=8 +mfa.res=mfa.res_METHOME_nearest + +MFAstats(mfa.res) + + +#### Clustering + +hcpc.res <- HCPC(mfa.res, nb.clust=nClust, conso=0, min=3, max=10, + metric = "euclidean") + +clusterStats(hcpc.res) + +# d=dimdesc(mfa.res) +# +# #### Confidence ellipses around categories per variable +# plotellipses(mfa.res,axes = c(2,3), means=FALSE) +# plotellipses(mfa.res,axes = c(2,3), +# keepvar="DEG_kind",label = "none") ## for 1 variable + +``` + +```{r} +plot <- function(expected,obs,nClust) { + ggplot_df=rbind(as.data.frame(expected),as.data.frame(obs)) + ggplot_df$Set=c(rep("All",nClust),rep("Subset",nClust)) + colnames(ggplot_df)[1:2]=c("MFAcluster","Freq") + + p <- ggplot(data=ggplot_df, aes(x=Set, y=Freq, fill=MFAcluster)) + + geom_bar(stat="identity")+ + scale_fill_brewer(palette="Paired")+ + theme_minimal() + ggtitle(selectedGS) + print(p) + +} + +``` + -- GitLab