diff --git a/Wednesday/MAF/EPITOME_CS_integration.Rmd b/Wednesday/MAF/EPITOME_CS_integration.Rmd deleted file mode 100644 index 09d0d1090dc9e3457916a7cfe16a7d3f6848295b..0000000000000000000000000000000000000000 --- a/Wednesday/MAF/EPITOME_CS_integration.Rmd +++ /dev/null @@ -1,1107 +0,0 @@ ---- -title: "Epigenomic memory of infection: Data integration" -subtitle : "H3K4me2, transcriptome time courses and chromatin state" -author: "Claudia Chica" -date: "02.2021" -output: - html_document: - keep_md: yes - number_sections: yes - smart: no - toc: yes - toc_float: yes -editor_options: - chunk_output_type: console ---- - -```{r setup, message = FALSE, warning = FALSE, echo = FALSE} -knitr::opts_chunk$set(message=FALSE,warning=FALSE,echo=FALSE)#, cache=TRUE, cache.lazy = FALSE) -knitr::opts_knit$set(progress = FALSE, verbose = FALSE) -#knitr::opts_chunk$set(dev = "svg") - -WDIR="/Volumes/bioit/12891_Chevalier_EpiMemStrep" - -library(ggplot2) -library(plyranges) -library(RColorBrewer) -library(knitr) -library(pheatmap) -library(patchwork) -library(FactoMineR) -library(org.Hs.eg.db) -library(msigdbr) - - -``` - -# Methylation profile - -```{r loadEPIGresults} -# Load epigenomic counts -mark="H3K4me2" -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/countNormLength_",mark,".RData") -load(fileOut) - -# Discard ONE peak with negative normalised values in a SINGLE replicate (artefact from cycliclowess normalisation) -countNormLength_H3K4me2=countNormLength_H3K4me2[countNormLength_H3K4me2$norm.H3K4me2_UI_REP1>0,] - -# Get list of DM peaks -files=list.files(path = paste0(WDIR,"/results_project12891/tables/tables_v3_UCSC/",mark,"/"),pattern = "resAnDif_with_annotation_sorted.final.xls", full.names = T) -DM_results=list() ; i=1 -for (fileDM in files) { - no_col <- max(count.fields(fileDM, sep = "\t")) - tmp=read.table(fileDM,header = F,sep="\t", fill = TRUE,skip = 1, - col.names=1:no_col,colClasses = c("character",rep("numeric",16),rep("character",no_col-17))) - row.names(tmp) <- tmp[,1] - tmp=tmp[,14:17] - colnames(tmp) <- c("baseMean", "log2FoldChange", "pvalue", "padj") - tmp=tmp[row.names(countNormLength_H3K4me2),] - DM_results[[i]]=tmp ; i=i+1 -} -rm(tmp) -names(DM_results) <- c("D7vsH3","D7vsUI","H3vsUI") - -# DM profiles -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/DMpeaks_",mark,".RData") -load(fileOut) - -# Add not DEGs -DMpeaks_H3K4me2=intersect(DMpeaks_H3K4me2,row.names(DM_results$D7vsUI)) -tmp=setdiff(row.names(countNormLength_H3K4me2),DMpeaks_H3K4me2) -DMs=data.frame(peakID=DMpeaks_H3K4me2, - Kind="DM", -Sign=ifelse(DM_results$D7vsUI[DMpeaks_H3K4me2,]$log2FoldChange>=0,"Gain","Loss")) -notDMs=data.frame(peakID=tmp, - Kind="notDM", -Sign=ifelse(DM_results$D7vsUI[tmp,]$log2FoldChange>=0,"Gain","Loss")) -DMs=rbind(DMs,notDMs) -DMs$peakID=as.character(DMs$peakID) -rm(tmp,notDMs) - -kable(table(DMs$Kind,DMs$Sign)) -``` - -# Transcriptional profile -```{r loadTOMEresults} -# Expression data normalised and batch corrected -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/Data_RMAandNoBatch_Annot.RData") -load(fileOut) - -# DA results, same object used for the functional analysis -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/DE_results_Annot.RData") -load(fileOut) - -SYMBOL2ENTREZ=unique(data.frame(SYMBOL=as.character(DE_results$H3vsUI$SYMBOL), - ENTREZID=as.character(DE_results$H3vsUI$ENTREZID),stringsAsFactors = FALSE)) - -# DEGs padjTh=.05 ; lfcTh=0 -fileOut=paste0(WDIR,"/results_project12891/tables/transcriptome/DEGs_H3K4me2profiles_CompleteAnnotationPeaks_nearestGene.txt") -DEGs=read.table(fileOut,header = T)[,7:10] -DEGs=unique(na.omit(DEGs)) -DEGs$SYMBOL=as.character(DEGs$SYMBOL) -DEGs$ENTREZID=as.character(DEGs$ENTREZID) -# Add not DEGs -tmp=setdiff(SYMBOL2ENTREZ$ENTREZID,DEGs$ENTREZID) -H3vsUI=DE_results$H3vsUI[!duplicated(DE_results$H3vsUI$ENTREZID),] -DIvsD7=DE_results$DIvsD7[!duplicated(DE_results$DIvsD7$ENTREZID),] -notDEGs=data.frame(ENTREZID=SYMBOL2ENTREZ[SYMBOL2ENTREZ$ENTREZID %in% tmp,]$ENTREZID, - SYMBOL=SYMBOL2ENTREZ[SYMBOL2ENTREZ$ENTREZID %in% tmp,]$SYMBOL, - Kind="notDE", -# Sign=ifelse(H3vsUI[H3vsUI$ENTREZID %in% tmp,]$logFC>=0, -# ifelse(DIvsD7[DIvsD7$ENTREZID %in% tmp,]$logFC>=0,"Up","UpDown"), -# ifelse(DIvsD7[DIvsD7$ENTREZID %in% tmp,]$logFC<0,"Down","DownUp"))) -Sign=ifelse(H3vsUI[H3vsUI$ENTREZID %in% tmp,]$logFC>=0,"Up","Down")) -DEGs=rbind(DEGs,notDEGs) -DEGs$SYMBOL=as.character(DEGs$SYMBOL) -DEGs$ENTREZID=as.character(DEGs$ENTREZID) -rm(tmp,H3vsUI,DIvsD7,notDEGs) - -kable(table(DEGs$Kind,DEGs$Sign)) - -``` - -```{r load_peaks2Gene} -mark="H3K4me2" -### Nearest gene -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mergedPeaks_annotNearest_",mark,".RData") -load(fileOut) -peaks2GeneNearest_H3K4me2=data.frame( - peakId=mergedPeaks_annotNearest_H3K4me2@ranges@NAMES, - SYMBOL=as.character(mergedPeaks_annotNearest_H3K4me2@elementMetadata@listData[["mcols.GENESYMBOL"]]), - distance=mergedPeaks_annotNearest_H3K4me2@elementMetadata@listData[["distance"]], - stringsAsFactors=FALSE) -peaks2GeneNearest_H3K4me2=merge(peaks2GeneNearest_H3K4me2,SYMBOL2ENTREZ,by.x=2,by.y=1,all.x=TRUE)[,c(2,1,3,4)] - -### Target gene from MEME -# fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mergedPeaks_annotTgene_",mark,".RData") -# load(fileOut) -# peaks2GeneTgene_H3K4me2=data.frame(peakId=mergedPeaks_annotTgene_H3K4me2$peakId, -# SYMBOL=as.character(mergedPeaks_annotTgene_H3K4me2$Gene_Name), -# ENTREZID=mergedPeaks_annotTgene_H3K4me2$ENTREZID, -# distance=0,stringsAsFactors=FALSE) - -# Load ENTREZ ENSEMBL ids mapping to cross Tgene and EPI-TOME information -ENSEMBLEids <- org.Hs.egENSEMBL -mapped_genes <- mappedkeys(ENSEMBLEids) -ENTREZtoENSEMBL <- as.data.frame(ENSEMBLEids[mapped_genes]) -colnames(ENTREZtoENSEMBL)=c("ENTREZID","ENSEMBLEID") - -# Load Tgene MEME results -fileIn=paste0(WDIR,"/results_project12891/MEME_Tgene/H2K4me2/links.tsv") -mergedPeaks_Tgene=read.table(fileIn,header = T,sep = "\t") -## Get only significant associations peak-gene -pvalTh=.05 #; FDRth=.5 -mergedPeaks_TgeneSig=subset(mergedPeaks_Tgene,mergedPeaks_Tgene$CnD_P_Value < pvalTh) -mergedPeaks_TgeneSig$chr=gsub("chr","",gsub(":[0-9-]*","",mergedPeaks_TgeneSig$RE_Locus)) -mergedPeaks_TgeneSig$start=as.character(as.integer(gsub("chr[0-9XY]*:","",gsub("-[0-9]*","",mergedPeaks_TgeneSig$RE_Locus)))-1) -mergedPeaks_TgeneSig$end=gsub(".*-","",mergedPeaks_TgeneSig$RE_Locus) -mergedPeaks_TgeneSig$peakID=apply(mergedPeaks_TgeneSig[,18:20],1,function(x){paste0(x[1],"_",x[2],"_",x[3])}) -mergedPeaks_TgeneSig$ENSEMBL=gsub("\\.[0-9-]*","",mergedPeaks_TgeneSig$Gene_ID) -peaks2GeneTgene_H3K4me2=na.omit(merge(mergedPeaks_TgeneSig, - ENTREZtoENSEMBL,by.x=22,by.y=2,all.x=TRUE)[,c(22,23,10,14,17)]) - -# Select unique gene-region associations -# select association minimun Tgene pvalue -peaks2GeneTgene_H3K4me2$peak2gene=paste0(peaks2GeneTgene_H3K4me2$peakID,"_", - peaks2GeneTgene_H3K4me2$ENTREZID) -peaks2GeneTgene_H3K4me2$tmp=paste0(peaks2GeneTgene_H3K4me2$peakID,"_", - peaks2GeneTgene_H3K4me2$ENTREZID,"_",as.character(peaks2GeneTgene_H3K4me2$CnD_P_Value)) -peaks2GeneTgene_H3K4me2=peaks2GeneTgene_H3K4me2[!duplicated(peaks2GeneTgene_H3K4me2$tmp),] - -tmp=aggregate(peaks2GeneTgene_H3K4me2$CnD_P_Value,list(peaks2GeneTgene_H3K4me2$peak2gene),min) -tmp$tmp=paste0(tmp$Group.1,"_",as.character(tmp$x)) -peaks2GeneTgene_H3K4me2=peaks2GeneTgene_H3K4me2[peaks2GeneTgene_H3K4me2$tmp %in% tmp$tmp,-7] - -rm(mergedPeaks_TgeneSig,ENSEMBLEids,mapped_genes,ENTREZtoENSEMBL) -``` - -# Chromatin states - -```{r selectPeaks, eval=FALSE, include=FALSE} -# Select peaks using counts -logFCthUp_H3K4me2=.005 #quantile .75 -logFCthDown_H3K4me2=-.005 #quantile .25 -tmp=countNormLength_H3K4me2 -selPeaksAllRep=row.names( - subset(tmp,( - (log(tmp$norm.H3K4me2_D7_REP1/tmp$norm.H3K4me2_UI_REP1,2) > logFCthUp_H3K4me2 & - log(tmp$norm.H3K4me2_D7_REP2/tmp$norm.H3K4me2_UI_REP2,2) > logFCthUp_H3K4me2) | - (log(tmp$norm.H3K4me2_D7_REP1/tmp$norm.H3K4me2_UI_REP1,2) < logFCthDown_H3K4me2 & - log(tmp$norm.H3K4me2_D7_REP2/tmp$norm.H3K4me2_UI_REP2,2) < logFCthDown_H3K4me2)))) - -selPeaksAll=row.names(tmp) - -# Select peaks related to DEGs -selPeaks_DEGs_nearest=unique(merge(DEGs[DEGs$Kind!="notDE",],peaks2GeneNearest_H3K4me2[,c(4,1)],by=1)[,5]) -selPeaks_DEGs_target=unique(merge(DEGs[DEGs$Kind!="notDE",],peaks2GeneTgene_H3K4me2[,c(3,1)],by=1)[,5]) - -print(c("Number of:")) -print(c("Total peaks",dim(tmp)[1])) -print(c("Selected peaks - methylation logFold",length(selPeaksAll))) -print(c("Selected peaks - reproducible methylation logFold",length(selPeaksAllRep))) -print(c("Selected peaks - nearest to DEG",length(selPeaks_DEGs_nearest))) -print(c("Selected peaks - with DEG target",length(selPeaks_DEGs_target))) - -print(c("Methylation logFold nearest to DEG",length(intersect(selPeaksAll,selPeaks_DEGs_nearest)))) -print(c("Methylation logFold with DEG target",length(intersect(selPeaksAll,selPeaks_DEGs_target)))) -print(c("Reproducible methylation logFold nearest to DEG",length(intersect(selPeaksAllRep,selPeaks_DEGs_nearest)))) -print(c("Reproducible methylation logFold with DEG target",length(intersect(selPeaksAllRep,selPeaks_DEGs_target)))) - -ggplotdf=data.frame(log2FC=c( - DM_results$D7vsUI[selPeaksAll,]$log2FoldChange, - DM_results$D7vsUI[selPeaksAllRep,]$log2FoldChange, - DM_results$D7vsUI[selPeaks_DEGs_nearest,]$log2FoldChange, - DM_results$D7vsUI[selPeaks_DEGs_target,]$log2FoldChange), - peakPopulation=factor(c(rep("All peaks",length(selPeaksAll)), - rep("Reproducible Log2FC",length(selPeaksAllRep)), - rep("DEGs nearest",length(selPeaks_DEGs_nearest)), - rep("DEGs target",length(selPeaks_DEGs_target))), - levels = c("All peaks","Reproducible Log2FC","DEGs nearest","DEGs target"))) - -ggplot(ggplotdf, aes(x=log2FC,fill=peakPopulation)) + - geom_density(alpha=0.5) + facet_grid(~ peakPopulation) -``` - -```{r get_mergedPeaks_annotTXS} -# Load all merged peaks coords -fileIn=paste0(WDIR,"/results_project12891/tables/tables_v3_UCSC/",mark,"/",mark,"_mergedPeaks.bed") -mergedPeaks=read_bed(fileIn) - -# Annotate merged peaks -fileAnnot=paste0(WDIR,"/results_project12891/igv/igv_session_v3/annotation/hg19_transcripts.geneSymbol.reformat.bed") -txs_gr <- read_bed(fileAnnot) -tss_gr <- GRanges(seqnames = txs_gr@seqnames, - ranges = IRanges(start = txs_gr@ranges@start-1e3, end = txs_gr@ranges@start+1e3), - strand = txs_gr@strand, - mcols = data.frame(GENESYMBOL=mcols(txs_gr)$name)) -txs_gr <- GRanges(seqnames = txs_gr@seqnames, - ranges = txs_gr@ranges, - strand = txs_gr@strand, - mcols = data.frame(GENESYMBOL=mcols(txs_gr)$name)) - -mergedPeaks_annotTXS=array(NA,dim=length(mergedPeaks)) ; names(mergedPeaks_annotTXS)=mergedPeaks$name -ovtss=unique(findOverlaps(mergedPeaks,tss_gr)@from) -mergedPeaks_annotTXS[ovtss]="TSS" -ovintraG=setdiff(unique(findOverlaps(mergedPeaks,txs_gr)@from),ovtss) -mergedPeaks_annotTXS[ovintraG]="intraG" -mergedPeaks_annotTXS[is.na(mergedPeaks_annotTXS)]="interG" -``` - -```{r annotationQuantification} -DIR=paste0(WDIR,"/results_project12891/Encode_data/bigWig/") -fileMetadata=paste0(DIR,"ENCODE_metadata_bigWig_foldChangeOverControl.released.txt") -Encode_metadata <- read.table(file=fileMetadata,header=TRUE,sep="\t") - -marks=c("H3K4me3", "H3K4me2", "H3K27ac", "H3K4me1", "H3K79me2", "H3K27me3") -AnnotationCols <- data.frame( - Mark=factor(sub("-human","",Encode_metadata$Experiment.target),levels = marks), - row.names = Encode_metadata$File.accession) - -colsG=brewer.pal(n = 3, name = "Greys") -colsM=brewer.pal(n = 4, name = "Greens") -colsS=brewer.pal(n = 3, name = "Blues")[2:3] -nClust=8 ; colsC=brewer.pal(n = nClust, name = "Set2") -colsAnnot <- list(Mark=c("H3K4me3"=colsS[1], "H3K4me2"=colsS[2], "H3K27ac"=colsM[1], "H3K4me1"=colsM[2], "H3K79me2"=colsM[3], "H3K27me3"=colsM[4]), - DM_sign=c("Gain"="#FF0000", "Loss"="#FF000030"), - DM_kind=c("DM"="#000000", "notDM"="transparent"), - Localization=c("TSS"=colsG[1], "intraG"=colsG[2], "interG"=colsG[3]), - CS=c("1"=colsC[1],"2"=colsC[2],"3"=colsC[3],"4"=colsC[4],"5"=colsC[5],"6"=colsC[6],"7"=colsC[7],"8"=colsC[8])) -``` - -```{bash getMtx_ENCODElogchange, eval=FALSE, include=FALSE} -WDIR=/pasteur/projets/policy01/BioIT/12891_Chevalier_EpiMemStrep/results_project12891/ -module load UCSC-tools/v373 -module load bedtools/2.25.0 - -# 1. Download bigWig files -# 2. Get bedGraph files -cd Encode_data/bigWig/ -for file in *.bigWig ; do -if [ ! -f ${file/bigWig/bedGraph} ] ; then echo $file ; -srun bigWigToBedGraph $file ${file/bigWig/bedGraph} & fi ; done - -# 3. Overlap with mergedPeaks -mark=H3K4me2 -filePeaks=$WDIR/tables/tables_v3_UCSC/*/${mark}_mergedPeaks.bed -cd $WDIR/Encode_data/bigWig - -# 3.1. Small files -emacs overlapBedGraph_mergedPeaks.sh -#!/bin/sh -i=$(head -n ${SLURM_ARRAY_TASK_ID} $1 | tail -n 1) -WDIR=$2 -filePeaks=$3 -mark=$4 - -if [ ! -f $WDIR/${i/.bedGraph/_mergedPeaks_${mark}.ov} ] ; then -srun -o $i.ovb -e $i.e intersectBed -a $filePeaks -b $i -wb -awk '{s[$4]=s[$4]+$NF ; n[$4]=n[$4]+1} END {for (p in n) {print p"\t"s[p]/n[p]}}' $i.ovb | -sort -k1,1 > $WDIR/${i/.bedGraph/_mergedPeaks_${mark}.ov} ; fi - -ls *bedGraph > fileList.txt -n=`wc -l fileList.txt | awk '{print $1}'` -sbatch -p hubbioit --qos hubbioit -o ov.o -e ov.e --mem=200G --array=1-$n%$n overlapBedGraph_mergedPeaks.sh fileList.txt $WDIR/Encode_data/bigWig/ $filePeaks $mark - -# 4 Assemble matrix -echo "Peak_name" > header0 ; i=0 -for file in *$mark*ov ; do let j=$i+1 -echo ${file/_mergedPeaks_${mark}.ov} | paste header$i - > header$j ; ((i++)) ; done -paste *$mark*ov | awk '{line=$1 ; for (i=2;i<=NF;i=i+2) {line=line"\t"$i} ; print line}' | -cat header$j - > ENCODE_mergedPeaks_${mark}.ov - -# Clean -rm $WDIR/Encode_data/bigWig/*.ovb $WDIR/Encode_data/bigWig/*.bedGraph -rm $WDIR/Encode_data/bigWig/*.e $WDIR/Encode_data/bigWig/ov.o -rm header* -``` - -```{r ENCODE_bigWig} -DIR=paste0(WDIR,"/results_project12891/Encode_data/bigWig/") -fileSignal=paste0(DIR,"ENCODE_mergedPeaks_",mark,".ov") -matSignal <- read.table(file=fileSignal,header=TRUE,row.names = 1,sep="\t") - -markOrder=c(#H3K4me3 - "ENCFF628ANV", "ENCFF510LTC", "ENCFF556OVF", - #"ENCFF313GRK", "ENCFF709KWY", "ENCFF635VEW", - "ENCFF250OYQ", "ENCFF189JCB", "ENCFF421ZNH", - #H3K4me2 - "ENCFF876DTJ", "ENCFF975YWP", "ENCFF419LFZ", - #H3K4me1 - "ENCFF165ZPD", "ENCFF569ZJQ", "ENCFF613NHX", - #H3K27ac - #"ENCFF216GUZ", "ENCFF597SRK", "ENCFF562LEK", - "ENCFF137KNW", "ENCFF103BLQ", "ENCFF663ILW", - #H3K79me2 - "ENCFF892LYD", "ENCFF154HXA", "ENCFF931LYX", - #H3K27me3 - "ENCFF819QSK", "ENCFF134YLO", "ENCFF336AWS") -``` - -```{r ENCODE_stats} -ENCODE_stats <- function(matSignalSel,selPeaks,mergedPeaks_annotTXS,DMs) { -allZero=which(apply(matSignalSel,1,sum)==0) -if (length(allZero)>0) matSignalSel=matSignalSel[-allZero,] - -mtx=t(matSignalSel) -d=dist(t(scale(mtx))) -hc=hclust(d,method = "ward.D2") -nClust=8 -clust=cutree(hc,k=nClust) - -selPeaks=intersect(selPeaks,row.names(matSignalSel)) - -# Complete AnnotationRows -AnnotationRows <- data.frame( - Localization=factor(mergedPeaks_annotTXS[selPeaks],levels = c("TSS","intraG","interG")), - CS=factor(clust[selPeaks]), - row.names = selPeaks -) -AnnotationRows=merge(AnnotationRows,DMs,by.x=0,by.y=1) -row.names(AnnotationRows)=AnnotationRows$Row.names ; AnnotationRows=AnnotationRows[,-1] -colnames(AnnotationRows)[3:4]=c("DM_kind","DM_sign") - -return(AnnotationRows) -} -``` - -```{r ENCODE_heatmap, fig.height=10, fig.width=8} -ENCODE_heatmap <- function(matSignalSel,AnnotationRows,colsAnnot) { -breaks=seq(-4,4,.1) ; cols=colorRampPalette(c("purple","black","yellow"))(length(breaks)+1) - -allZero=which(apply(matSignalSel,1,sum)==0) -if (length(allZero)>0) matSignalSel=matSignalSel[-allZero,] - -pheatmap(matSignalSel, - scale = "row", cluster_rows = T, cluster_cols = F, - clustering_method = "ward.D2", - show_rownames = F, show_colnames = F, - annotation_col = subset(AnnotationCols,row.names(AnnotationCols) %in% markOrder), - annotation_row = AnnotationRows, - annotation_colors = colsAnnot, - color = cols, border = NA, - #main = paste(mark,"DM peaks",length(DMpeaks)), - fontsize = 12) -} -``` - -## All H3K4me2 peaks - -```{r ENCODE_counts, fig.height=10, fig.width=8, cache=TRUE} -selPeaksAll=row.names(countNormLength_H3K4me2) -selPeaks=selPeaksAll -matSignalSel=na.omit(matSignal[selPeaks,markOrder]) -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/AnnotationRows_allPeaks.RData") -if (file.exists(fileOut)) {load(fileOut)} else { -AnnotationRows=ENCODE_stats(matSignalSel,selPeaks,mergedPeaks_annotTXS,DMs) -AnnotationRows_allPeaks=AnnotationRows -save(AnnotationRows_allPeaks,file=fileOut)} - -# Tables -AnnotationRows=AnnotationRows_allPeaks -t=table(AnnotationRows$CS) -kable(ceiling(t/sum(t)*100),col.names = c("CS","Freq")) - -t1=table(AnnotationRows$Localization) -tt=table(AnnotationRows$CS,AnnotationRows$Localization) -kable(ceiling(t1/sum(t1)*100),col.names = c("Localization","Freq")) -kable(ceiling(tt/as.numeric(t)*100),row.names = T) -# -# t1=table(AnnotationRows$DM_sign) -# kable(ceiling(t1/sum(t1)*100),col.names = c("H3K4me2 Profile","Freq")) -# tt=table(AnnotationRows$CS,AnnotationRows$DM_sign) -# kable(ceiling(tt/as.numeric(t)*100),row.names = T) -# -# t1=table(AnnotationRows$DM_kind) -# kable(ceiling(t1/sum(t1)*100),col.names = c("H3K4me2 Dynamic","Freq")) -# tt=table(AnnotationRows$CS,AnnotationRows$DM_kind) -# kable(ceiling(tt/as.numeric(t)*100),row.names = T) - -ENCODE_heatmap(matSignalSel,AnnotationRows,colsAnnot) -``` - -# Multi factorial analysis - -Multi factorial analysis (MFA) (Escofier & Pages, Computational Statistics & Data Analysis, 1994) studies several groups of variables (numerical and/or categorical) defined on the same set of individuals. MFA is a factor analysis applied to the array including all groups of variables. Roughly, the behaviour of the method is equivalent to PCA (concerning quantitative variables) or to MCA (concerning qualitative variables). - - -```{r MFA_functions} -#http://juliejosse.com/wp-content/uploads/2019/03/MFA_staf2015.pdf - -individualContributionPlot <- function(mfa.res,nPeaks) { -contribs=data.frame() -for (i in 1:5) { -rs=mfa.res[["ind"]][["contrib"]][order(mfa.res[["ind"]][["contrib"]][,i],decreasing = TRUE),i] %>% cumsum() -peakId=names(rs) -contribs=rbind(contribs,data.frame(peakId,seq(1,length(rs),1),as.vector(rs),rep(i,length(rs))))} -colnames(contribs)=c("peakId","Rank","RankedSum","Dimension") -contribs$Dimension=factor(contribs$Dimension) - -p <- ggplot(contribs,aes(x=Rank,y=RankedSum)) + geom_point(aes(color=Dimension)) - -print(p) - -return(contribs) -} - -MFAstats <- function(mfa.res,xlim,ylim) { -# Eigen values of separate analysis -# Describe dimensionality of each variable group -#print(kable(sapply(mfa.res[["separate.analyses"]],"[[", 1), -# caption = "Eigen values of separate analysis")) -# Eigen values of global analysis -# How many components are necessary to describe the total inertia of the dataset -print(kable(mfa.res[["eig"]][1:10,], - caption = "Eigen values of global analysis")) -# Correlation coeffs -# How much of the stt defined by component C depends on groups(s) G : G -> C -# Fundamental to see if there are structures COMMON to groups -> useful to integrate -print(kable(mfa.res[["group"]][["correlation"]],caption = "Group correlation with factor")) -# Inertia or group contribution -# How much of the inertia of group G is explained by component C : C -> G -print(kable(mfa.res[["group"]][["cos2"]],caption = "Group inertia per factor")) -# Group contribution -plotGrouprepresentation(mfa.res) -# Invidual contribution to each component -# Determine wheter an axis is due to only some individuals -#mfa.res[["ind"]][["contrib"]] -#individualContributions=individualContributionPlot(mfa.res,nPeaks) -# Categorical variable representation (centroid of inds ) -# Determine how are -#dimdesc(mfa.res)$Dim.2 -#splotIndrepresentation(mfa.res,xlim,ylim) -# Variable contribution -plotVarrepresentation(mfa.res) -} - -plotGrouprepresentation <- function(mfa.res) { -p1 <- plot.MFA(mfa.res,choix = "group",axes = c(1,2), title = "") -p2 <- plot.MFA(mfa.res,choix = "group",axes = c(1,3), title = "") -p3 <- plot.MFA(mfa.res,choix = "group",axes = c(1,4), title = "") -p4 <- plot.MFA(mfa.res,choix = "group",axes = c(1,5), title = "") -print((p1 | p2) / (p3 | p4)) -} - -plotIndrepresentation <- function(mfa.res,xlim,ylim) { - if (ylim!="" & xlim!="") { -p1 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,2), - title = "",ylim=ylim,xlim=xlim) -p2 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,3), - title = "",ylim=ylim,xlim=xlim) -p3 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,4), - title = "",ylim=ylim,xlim=xlim) -p4 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,5), - title = "",ylim=ylim,xlim=xlim) } else -{ -p1 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,2), title = "") -p2 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,3), title = "") -p3 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,4), title = "") -p4 <- plot.MFA(mfa.res,choix = "ind", invisible = "ind", axes = c(1,5), title = "") - -} -print((p1 | p2) / (p3 | p4)) -} - -plotVarrepresentation <- function(mfa.res) { - title="" - p1 <- plot.MFA(mfa.res,choix="var", axes=c(1,2),title=title,cex=.7) - p2 <- plot.MFA(mfa.res,choix="var", axes=c(1,3),title=title,cex=.7) - p3 <- plot.MFA(mfa.res,choix="var", axes=c(1,4),title=title,cex=.7) - p4 <- plot.MFA(mfa.res,choix="var", axes=c(1,5),title=title,cex=.7) - -print((p1 | p2) / (p3 | p4) + plot_layout(guides = "collect")) - -} - -clusterStats <- function(hcpc.res) { -### Cluster description -## By variables: -# Test if variable (OMIC samples) values distribution per cluster is random or not -# For quantitative variables -print(kable(hcpc.res[["desc.var"]]$quanti,caption = "Description by quantitative variables")) -# For qualitative variables -print(kable(hcpc.res[["desc.var"]][["category"]],caption = "Description by qualitative variables")) -## By components -# Test if individuals coordinates distribution per cluster is random or not -# For quantitative variables -print(kable(hcpc.res[["desc.axes"]][["quanti"]], - caption = "Description by factors (quantitative variables)")) -# For qualitative variables -print(kable(hcpc.res[["desc.var"]][["test.chi2"]], - caption = "Description by factors (qualitative variables)")) -## By individuals (genomic regions) -# Nearest individuals to the cluster's centroid -print("Description by individuals (nearest):") -print(hcpc.res[["desc.ind"]]$para) -# Furthest individuals to the cluster's centroid -print("Description by individuals (furthest):") -print(hcpc.res[["desc.ind"]]$dist) -} - -``` - -```{r MFA_prepareMtx} -# Differential analysis info -AnnotationRows=AnnotationRows_allPeaks[,2:4] -#DEGs2peaks=merge(DEGs,peaks2GeneTgene_H3K4me2[,c(3,1)],by=1) -#AnnotationRows=merge(AnnotationRows,DEGs2peaks[,c(5,4,3,1)],by.x=0,by.y=1) -DEGs2peaks=merge(DEGs,peaks2GeneTgene_H3K4me2,by.x=1,by.y=2) -AnnotationRows=merge(AnnotationRows,DEGs2peaks[,c(5,3,4,1,2,6:8)],by.x=0,by.y=1) -AnnotationRows$Distance=log(abs(AnnotationRows$Distance)+1,10) -#AnnotationRows=AnnotationRows[!duplicated(AnnotationRows[,1]),] -#AnnotationRows <- AnnotationRows[,-c(1,8)] ; -colnames(AnnotationRows)[5:6] <- c("DEG_kind","DEG_sign") -AnnotationRows$DEG_kind=factor(AnnotationRows$DEG_kind,levels = c("common","only_H3vsUI","only_DIvsD7","notDE")) -#AnnotationRows$DEG_sign=factor(AnnotationRows$DEG_sign,levels = c("Up","Down","UpDown","DownUp")) -AnnotationRows$DEG_sign=factor(AnnotationRows$DEG_sign,levels = c("Up","Down")) -#Re-order columns -AnnotationRows=AnnotationRows[,c(1,7:11,2:6)] - -# Merge methylation, transcription and epigenome signal -METHOME=merge(t(scale(t(countNormLength_H3K4me2))), - DM_results$D7vsUI[,1:2],by=0) -colnames(METHOME)[8:9]=c("H3K4me2_log2baseMean","H3K4me2_D7vsUI_log2FC") -METHOME$H3K4me2_log2baseMean=log(METHOME$H3K4me2_log2baseMean,2) - -# Merge TOME data -Data=Data_RMA_NoBatchComBat_Annot -t=na.omit(Data[!duplicated(Data$ENTREZID),])[,c(1,5,9,2,6,10,3,7,11,4,8,12)] -TOME=log(t,2) -row.names(TOME)=na.omit(Data[!duplicated(Data$ENTREZID),14]) - -# Merge ENCODE data -samples=AnnotationCols[markOrder,] -names(samples)=markOrder -ENCODEmean=t(scale(t( - data.frame( - H3K4me3=apply(matSignal[,names(samples[samples=="H3K4me3"])],1,function(x) mean((x))), - H3K4me2=apply(matSignal[,names(samples[samples=="H3K4me2"])],1,function(x) mean((x))), - H3K4me1=apply(matSignal[,names(samples[samples=="H3K4me1"])],1,function(x) mean((x))), - H3K27ac=apply(matSignal[,names(samples[samples=="H3K27ac"])],1,function(x) mean((x))), - H3K79me2=apply(matSignal[,names(samples[samples=="H3K79me2"])],1,function(x) mean((x))), - H3K27me3=apply(matSignal[,names(samples[samples=="H3K27me3"])],1,function(x) mean((x))), - row.names = row.names(matSignal)) -))) - -# EPITOME all -EPITOME=merge(AnnotationRows,METHOME,by=1) -EPITOME=merge(EPITOME,ENCODEmean,by.x=1,by.y=0) -EPITOME=merge(EPITOME,TOME,by.x=2,by.y=0) -row.names(EPITOME)=paste0(EPITOME$Row.names,"_",EPITOME$ENTREZID) -colnames(EPITOME)[2]="peakID" -``` - -```{r MFA_getMtx1} - -#### Get INPUT matrix - -# Select all peak2gene associations where there is a DMR or a DEG -mfa.EPITOME=subset(EPITOME,EPITOME$DM_kind=="DM" | EPITOME$DEG_kind!="notDE") -corrTh=.4 -mfa.EPITOME_corrTh=subset(EPITOME,(EPITOME$DM_kind=="DM" | EPITOME$DEG_kind!="notDE") & - abs(EPITOME$Correlation)>corrTh) -distTh=20e3 -mfa.EPITOME_distTh=subset(EPITOME,(EPITOME$DM_kind=="DM" | EPITOME$DEG_kind!="notDE") & - EPITOME$Distance<log(distTh+1,10)) -# Save INPUT matix objects -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.EPITOME.RData") -if (!file.exists(fileOut)) {save(mfa.EPITOME,file = fileOut)} -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.EPITOME_distTh.RData") -if (!file.exists(fileOut)) {save(mfa.EPITOME_distTh,file = fileOut)} -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.EPITOME_corrTh.RData") -if (!file.exists(fileOut)) {save(mfa.EPITOME_corrTh,file = fileOut)} - -``` - -## Input matrix - -MFA is performed on the `r dim(mfa.EPITOME)[1]` peak-gene associations that include a DMR or a DEG. This is the distribution of quantitative and qualitative group variables. - -```{r MFA_getMtx2} -# Check matrix behaviour -boxplot(mfa.EPITOME[,c(26:37,18:19,4,5,20:25)],las=2,pch=19,cex=.5, - ylab="Scaled values by group",cex.axis=.8, - col=c(rep(palette()[8],12),rep(palette()[6],2), - rep(palette()[3],1),rep(palette()[4],1),rep(palette()[5],6)), - main="Distribution of quantitative variables for MFA") - -kable(table(mfa.EPITOME$DEG_kind,mfa.EPITOME$DEG_sign),caption = "DEGs for MFA") -kable(table(mfa.EPITOME$DM_kind,mfa.EPITOME$DM_sign),caption = "DMRs for MFA") -``` - -## MFA components description - -```{r MFA, fig.height=10, fig.width=10, cache=TRUE} - -#### MFA -colInds=c(26:37,18:19,4,5,20:25,7,10:11,8:9) - -# fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_distTh.RData") -# if (file.exists(fileOut)) {load(fileOut)} else { -# mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_distTh <- MFA(mfa.EPITOME_distTh[,colInds], ncp=5, -# group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), -# name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(6)) -# save(mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_distTh,file=fileOut)} -# -# fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_corrTh.RData") -# if (file.exists(fileOut)) {load(fileOut)} else { -# mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_corrTh <- MFA(mfa.EPITOME_corrTh[,colInds], ncp=5, -# group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), -# name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(6)) -# save(mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_corrTh,file=fileOut)} - -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS.RData") -if (file.exists(fileOut)) {load(fileOut)} else { - mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS <- MFA(mfa.EPITOME[,colInds], ncp=5, - group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), - name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(6)) -save(mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS,file=fileOut)} - -#### MFA stats -mfa.res=mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS -title="" ; xlim=c(-1,1) ; ylim=c(-1,1) -MFAstats(mfa.res,"","") -#print(plot.MFA(mfa.res,choix="axes", axes=c(1,2),title=title)) -plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(1,2), ylim=c(-1,2)) -p1 <- plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(1,2), xlim=xlim, ylim=ylim) -p2 <- plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(3,2), xlim=xlim, ylim=ylim) -p3 <- plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(3,4), xlim=xlim, ylim=ylim) -p4 <- plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(5,4), xlim=xlim, ylim=ylim) -print((p1 | p2) / (p3 | p4)) -``` - -# Hierarchical clustering on MFA components - -Clustering is performed on the first 5 components obtained with the MFA. - -```{r HCPCclustering, fig.height=10, fig.width=10, cache=TRUE} - -#### Clustering -nClust=12 ; maxClustSize=1400 -hcpc.res <- HCPC(mfa.res, nb.clust=nClust, conso=0, min=3, max=10, - metric = "euclidean", method="ward",graph = FALSE,nb.par = maxClustSize) - -#clusterStats(hcpc.res) - -# Cluster description in term of qualitative variables -lapply(hcpc.res[["desc.var"]][["category"]],function(x) {kable(subset(x,x[,"v.test"]>5.3))}) - -# Cluster description -tmp=hcpc.res[["data.clust"]][,23:28] -table=merge(mfa.EPITOME[,1:6],tmp,by=0) -colnames(table)[1]="peakIDgene_association" -colnames(table)[13]="MFAclust" - -layout(matrix(1:2,nrow = 2)) -boxplot(table$Distance ~ table$MFAclust,pch=19,cex=.5, - xlab="",ylab="Distance peak - gene") -boxplot(table$Correlation ~ table$MFAclust,pch=19,cex=.5, - xlab="MFA cluster",ylab="Correlation peak_hist-gene_exp") - -# Table of associations with clusters -fileOut=paste0(WDIR,"/results_project12891/tables/integration/mfa_clust_TOMEMETHDISTCORREPIGDEGDMR_CS.txt") -if (!file.exists(fileOut)) { -# Add individual data -for (i in 1:nClust) { - table=merge(table,as.data.frame(hcpc.res[["desc.ind"]][["para"]][[i]]),by.x=1,by.y=0,all.x=TRUE) - colnames(table)[dim(table)[2]]=paste0("distFromCentroid_MFAclust",i) -} - -write.table(table,file = fileOut,sep="\t",quote=FALSE,row.names = FALSE, col.names = TRUE) -} - -``` - -## Statistics on clusters - -Number of total genes/peaks per cluster (Ngenes,Npeaks). Number of genes/peaks belonging only to a given cluster (NgeneOnlyClust, NpeakOnlyClust). Nearest cluster in terms of shared genes/peaks (nearestClustGene,nearestClustPeak) and corresponding number of shared genes/peaks (nearestClustNgene,nearestClustNpeak). - -```{r} -geneFreq_allAss=table(table$ENTREZID) -singleClustGenes=names(subset(geneFreq_allAss,geneFreq_allAss==1)) -peakFreq_allAss=table(table$peakID) -singleClustPeaks=names(subset(peakFreq_allAss,peakFreq_allAss==1)) -clusts=split(table,table$MFAclust) -geneClust=unique(table[,c(2,13)]) -peakClust=unique(table[,c(3,13)]) - -clustStats=data.frame(Counts=c("Ngenes","NgeneOnlyClust","nearestClustGene","nearestClustNgene", - "Npeaks","NpeaksOnlyClust","nearestClustPeak","nearestClustNpeak")) ; i=1 -for (clust in clusts) { - Ngenes=length(unique(clust$ENTREZID)) - NgeneOnlyClust=length(setdiff(unique(clust$ENTREZID), - subset(geneClust,geneClust$MFAclust!=i)$ENTREZID)) - nearestClustGene=names(sort(table(geneClust[geneClust$ENTREZID %in% unique(clust$ENTREZID),]$MFAclust),decreasing = T)[2]) - nearestClustNgene=as.numeric(sort(table(geneClust[geneClust$ENTREZID %in% unique(clust$ENTREZID),]$MFAclust),decreasing = T)[2]) - Npeaks=length(unique(clust$peakID)) - NpeaksOnlyClust=length(setdiff(unique(clust$peakID), - subset(peakClust,peakClust$MFAclust!=i)$peakID)) - nearestClustPeak=names(sort(table(peakClust[peakClust$peakID %in% unique(clust$peakID),]$MFAclust),decreasing = T)[2]) - nearestClustNpeak=as.numeric(sort(table(peakClust[peakClust$peakID %in% unique(clust$peakID),]$MFAclust),decreasing = T)[2]) - clustStats=cbind(clustStats,c(Ngenes,NgeneOnlyClust,nearestClustGene,nearestClustNgene,Npeaks,NpeaksOnlyClust,nearestClustPeak,nearestClustNpeak)) ; i=i+1 -} -colnames(clustStats)=c("Counts",seq(1,nClust,1)) -kable(clustStats) -``` - - -# Explore gene sets from functional analysis - -Check representation of selected gene sets (GS) among genes of every MFA cluster. Values correspond to the ratio between the observed number of genes belonging to a given GS and the expected number per cluster. - -```{r functions_genesetClusterRepresentation, cache=TRUE} -getClustGSenrich <- function(table,selectedGSs,m_t2g_lists) { - tmp=unique(table[,c(2,13)]) - t=table(tmp$MFAclust) - expected=t/sum(t) - overRep=data.frame(matrix(nrow = length(selectedGSs),ncol = nClust)) - colnames(overRep)=seq(1,nClust,1) ; row.names(overRep)=selectedGSs - for (selectedGS in selectedGSs) { - ENTREZ_IDs=unique(c(m_t2g_lists[[selectedGS]])) - subTable=tmp[tmp$ENTREZID %in% ENTREZ_IDs,] - - # Clustering distribution - obs=table(subTable$MFAclust)/sum(table(subTable$MFAclust)) - - overRep[selectedGS,] <- obs/expected - } - return(overRep) - -} - -m_t2g_df <- msigdbr(species = "Homo sapiens", category = "C2") %>% - dplyr::select(gs_subcat, gs_name, entrez_gene) -tmp=m_t2g_df[m_t2g_df$gs_subcat=="CP:REACTOME",] -m_t2g_list_reactome=split(as.character(tmp$entrez_gene),gsub("REACTOME_","",tmp$gs_name)) -# tmp=m_t2g_df[m_t2g_df$gs_subcat=="CP:KEGG",] -# m_t2g_list_kegg=split(as.character(tmp$entrez_gene),gsub("KEGG_","",tmp$gs_name)) - -# m_t2g_lists=c(m_t2g_list_reactome,m_t2g_list_kegg) -m_t2g_lists=m_t2g_list_reactome -``` - -## GS specific to 1st infection -```{r genesetClusterRepresentation_only1st} - -selectedGSs_only1st=c("NUCLEAR_RECEPTOR_TRANSCRIPTION_PATHWAY", - "CHEMOKINE_RECEPTORS_BIND_CHEMOKINES", - "CHROMATIN_MODIFYING_ENZYMES", - "PKMTS_METHYLATE_HISTONE_LYSINES", - "HATS_ACETYLATE_HISTONES", - "VESICLE_MEDIATED_TRANSPORT", - "MEMBRANE_TRAFFICKING", - "DNA_DOUBLE_STRAND_BREAK_RESPONSE", - "ANTIGEN_PROCESSING_UBIQUITINATION_PROTEASOME_DEGRADATION", - "PROCESSING_OF_DNA_DOUBLE_STRAND_BREAK_ENDS", - "DNA_DAMAGE_TELOMERE_STRESS_INDUCED_SENESCENCE") -tab=getClustGSenrich(table,selectedGSs_only1st,m_t2g_lists) -kable(tab,digits = rep(1,nClust)) -``` - -## GS specific to 2nd infection -```{r genesetClusterRepresentation_only2nd} -selectedGSs_only2nd=c("INFLUENZA_INFECTION", - "CELLULAR_RESPONSES_TO_EXTERNAL_STIMULI", - "INFECTIOUS_DISEASE","DISEASE", - "SIGNALING_BY_ROBO_RECEPTORS","SIGNALING_BY_NUCLEAR_RECEPTORS") - tab=getClustGSenrich(table,selectedGSs_only2nd,m_t2g_lists) -kable(tab,digits = rep(1,nClust)) -``` - -## GS common to both infections -```{r genesetClusterRepresentation_common} -selectedGSs_common=c("INTERLEUKIN_4_AND_INTERLEUKIN_13_SIGNALING", - "SIGNALING_BY_INTERLEUKINS","DNA_DOUBLE_STRAND_BREAK_REPAIR","DNA_REPAIR", - "FOXO_MEDIATED_TRANSCRIPTION", - "NEGATIVE_REGULATION_OF_MAPK_PATHWAY", - "CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM", - "INTERLEUKIN_10_SIGNALING", - "HDR_THROUGH_SINGLE_STRAND_ANNEALING_SSA_", - "FORMATION_OF_SENESCENCE_ASSOCIATED_HETEROCHROMATIN_FOCI_SAHF_") -tab=getClustGSenrich(table,selectedGSs_common,m_t2g_lists) -kable(tab,digits = rep(1,nClust)) - -``` - -```{bash searchSpecificGenes, include=FALSE, eval=FALSE} -cd /Volumes/bioit/12891_Chevalier_EpiMemStrep/results_project12891/tables/ -# Count genes only 2nd in all clusters -awk '$(NF-2)>.4 {print $2}' transcriptome/Sig_H3vsUI_D7vsUI_DIvsD7_DIvsH3_Annot.txt> id ; -grep -f id integration/mfa_clust_TOMEMETHDISTCORREPIGDEGDMR_CS.txt|awk '{print $4,$(NF-12)}' | -sort -u | awk '{print $2}'|sort |uniq -c - -# Find genes only 2nd in cluster 7 -awk '$(NF-2)>.4 {print $2}' transcriptome/Sig_H3vsUI_D7vsUI_DIvsD7_DIvsH3_Annot.txt> id -grep -f id integration/mfa_clust_TOMEMETHDISTCORREPIGDEGDMR_CS.txt| -awk '$(NF-12)==7 {print $4}' | sort -u - -rm id - -``` - - -# References - -```{r} -sessionInfo() -``` - - -```{r MFA_inputMatrixBenchmark, include=FALSE, eval=FALSE} -ass=sample(row.names(mfa.EPITOME_distTh), 500, replace = FALSE, prob = NULL) -xlim=c(-.6,.6) ; ylim=c(-.6,.6) - -title="TOME-METH_DIST-CORR-EPIG-CS-DEG-DMR" -colInds=c(26:37,18:19,4,5,20:25,7,10:11,8:9) -mfa.res_TOMEMETH_DISTCORREPIGCSDEGDMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, - group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), - name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(3,4,5,6,7,8)) - -title="TOME-METH-DIST-CORR_EPIG-CS-DEG-DMR" -mfa.res_TOMEMETHDISTCORR_EPIGCSDEGDMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, - group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), - name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"), - num.group.sup=c(5,6,7,8)) - -title="TOME-METH-DIST-CORR-EPIG_CS-DEG-DMR" -mfa.res_TOMEMETHDISTCORREPIG_CSDEGDMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, - group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), - name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(6,7,8)) - -title="TOME-METH-DIST-CORR-EPIG-CS_DEG-DMR" -mfa.res_TOMEMETHDISTCORREPIGCS_DEGDMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, - group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), - name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(7,8)) - -title="TOME-METH-DIST-CORR-EPIG-CS-DEG_DMR" -mfa.res_TOMEMETHDISTCORREPIGCSDEG_DMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, - group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), - name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(8)) - -title="TOME-METH-DIST-CORR-EPIG-CS-DEG-DMR" -mfa.res_TOMEMETHDISTCORREPIGCSDEGDMR_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, - group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), - name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR")) - -title="TOME-METH-DMR-DEG-DIST-CORR-EPIG_CS" -mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_distTh <- MFA(mfa.EPITOME_distTh[ass,colInds], ncp=5, - group=c(12,2,1,1,6,1,2,2), type=c(rep("c",5),rep("n",3)), - name.group=c("TOME","METH","DIST","CORR","EPIG","CS","DEG","DMR"),num.group.sup=c(6)) - -results=list(TOMEMETH=mfa.res_TOMEMETH_DISTCORREPIGCSDEGDMR_distTh, - TOMEMETHDISTCORR=mfa.res_TOMEMETHDISTCORR_EPIGCSDEGDMR_distTh, - TOMEMETHDISTCORREPIG=mfa.res_TOMEMETHDISTCORREPIG_CSDEGDMR_distTh, - TOMEMETHDISTCORREPIGCS=mfa.res_TOMEMETHDISTCORREPIGCS_DEGDMR_distTh, - TOMEMETHDISTCORREPIGCSDEG=mfa.res_TOMEMETHDISTCORREPIGCSDEG_DMR_distTh, - TOMEMETHDISTCORREPIGCSDEGDMR=mfa.res_TOMEMETHDISTCORREPIGCSDEGDMR_distTh, - TOMEMETHDISTCORREPIGDEGDMR=mfa.res_TOMEMETHDISTCORREPIGDEGDMR_CS_distTh - ) - -xlim=c(-1,1) ; ylim=c(-1,1) -i=1 -for (mfa.res in results) { - title=names(results)[i] ; print(title) - print(kable(mfa.res[["eig"]][1:10,],caption = "Eigen values of global analysis")) - print(plot.MFA(mfa.res,choix="axes", axes=c(1,2),title=title)) - print(plot.MFA(mfa.res,choix="var", axes=c(1,2),title=title)) - print(plot.MFA(mfa.res,choix="var", axes=c(3,2),title=title)) - print(plot.MFA(mfa.res,choix="var", axes=c(3,4),title=title)) - plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(1,2)) - plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(1,2), xlim=xlim, ylim=ylim) - plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(3,2), xlim=xlim, ylim=ylim) - plot.MFA(mfa.res,choix="ind", invisible="ind", axes=c(3,4), xlim=xlim, ylim=ylim) - i=i+1 -} - -MFAstats(mfa.res,xlim,ylim) - -################################### -################################### -# OTHER INPUT matrices - -mfa.EPITOME_notDM_distTh=subset(EPITOME,(EPITOME$DM_kind!="DM" & EPITOME$DEG_kind!="notDE") & - EPITOME$Distance<log(distTh+1,10)) - -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_distTh.RData") -colInds=c(26:37,18:19,10:11,8:9) -if (file.exists(fileOut)) {load(fileOut)} else { -mfa.res_TOMEMETH_distTh2 <- MFA(mfa.EPITOME_distTh[,colInds], ncp=5, - group=c(12,2,2,2), type=c(rep("c",2),rep("n",2)), - name.group=c("TOME","METH","DEG","DMR"), - num.group.sup=c(3,4)) -save(mfa.res_TOMEMETH_distTh2,file=fileOut)} - -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_EPIG.RData") -colInds=c(26:37,18:19,7,10:11,8:9) -if (file.exists(fileOut)) {load(fileOut)} else { -mfa.res_TOMEMETH_EPIG <- MFA(mfa.EPITOME[,colInds], ncp=5, - group=c(12,2,1,2,2), type=c(rep("c",2),rep("n",3)), - name.group=c("TOME","METH","EPIG","DEG","DMR"), - num.group.sup=c(3,4,5)) -save(mfa.res_TOMEMETH_EPIG,file=fileOut)} - -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_EPIG_distTh.RData") -colInds=c(26:37,18:19,7,10:11,8:9) -if (file.exists(fileOut)) {load(fileOut)} else { -mfa.res_TOMEMETH_EPIG_distTh <- MFA(mfa.EPITOME_distTh[,colInds], ncp=5, - group=c(12,2,1,2,2), type=c(rep("c",2),rep("n",3)), - name.group=c("TOME","METH","EPIG","DEG","DMR"), - num.group.sup=c(3,4,5)) -save(mfa.res_TOMEMETH_EPIG_distTh,file=fileOut)} - -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_EPIG_notDM_distTh.RData") -colInds=c(26:37,18:19,7,10:11,8:9) -if (file.exists(fileOut)) {load(fileOut)} else { -mfa.res_TOMEMETH_EPIG_notDM_distTh <- MFA(mfa.EPITOME_notDM_distTh[,colInds], ncp=5, - group=c(12,2,1,2,2), type=c(rep("c",2),rep("n",3)), - name.group=c("TOME","METH","EPIG","DEG","DMR"), - num.group.sup=c(3,4,5)) -save(mfa.res_TOMEMETH_EPIG_notDM_distTh,file=fileOut)} - - -``` - -```{r MFAold, eval=FALSE, include=FALSE} - -selPeaks=unique(c(as.character(DMs[DMs$Kind=="DM",]$peakID),selPeaks_DEGs_nearest)) -selPeaks_nearest=unique(c(as.character(DMs[DMs$Kind=="DM",]$peakID), - intersect(selPeaks_DEGs_nearest,selPeaksAllRep))) -selPeaks_target=unique(c(as.character(DMs[DMs$Kind=="DM",]$peakID), - intersect(selPeaks_DEGs_target,selPeaksAllRep))) -nPeaks=length(selPeaks) - -selPeaks_nearest=unique(c(as.character(DMs[DMs$Kind=="DM",]$peakID), - intersect(selPeaks_DEGs_nearest,selPeaksAllRep))) - -#### MFA -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_EPIG_nearest.RData") -if (file.exists(fileOut)) {load(fileOut)} else { -mfa.res_TOMEMETH_EPIG_nearest <- MFA(na.omit(EPITOME[selPeaks_nearest,c(7:18,5,6,25:30)]), ncp=5, - group=c(12,2,2,2,2), type=c(rep("c",2),rep("n",3)), - name.group=c("TOME","METH","EPIG","DM","DEG"), - num.group.sup=c(4,5)) -save(mfa.res_TOMEMETH_EPIG_nearest,file=fileOut)} - -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEMETH_EPIG_target.RData") -if (file.exists(fileOut)) {load(fileOut)} else { -mfa.res_TOMEMETH_EPIG_target <- MFA(na.omit(EPITOME[selPeaks_target,c(7:18,5,6,25:30)]), ncp=5, - group=c(12,2,2,2,2), type=c(rep("c",2),rep("n",3)), - name.group=c("TOME","METH","EPIG","DM","DEG"), - num.group.sup=c(4,5)) -save(mfa.res_TOMEMETH_EPIG_target,file=fileOut)} - -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_EPIGMETH_nearest.RData") -if (file.exists(fileOut)) {load(fileOut)} else { -mfa.res_EPIGMETH_nearest <- MFA(na.omit(EPITOME[selPeaks_nearest,c(5,6,19:30)]), ncp=5, - group=c(2,6,1,1,2,2), type=c(rep("c",2),rep("n",4)), - name.group=c("METH","EPIGENOME","LOC","CSkind","DM","DEG"), - num.group.sup=c(4)) -save(mfa.res_EPIGMETH_nearest,file=fileOut)} - -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_EPIGMETH_target.RData") -if (file.exists(fileOut)) {load(fileOut)} else { -mfa.res_EPIGMETH_target <- MFA(na.omit(EPITOME[selPeaks_target,c(5,6,19:30)]), ncp=5, - group=c(2,6,1,1,2,2), type=c(rep("c",2),rep("n",4)), - name.group=c("METH","EPIGENOME","LOC","CSkind","DM","DEG"), - num.group.sup=c(4)) -save(mfa.res_EPIGMETH_target,file=fileOut)} - -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_EPIG_nearest.RData") -if (file.exists(fileOut)) {load(fileOut)} else { -mfa.res_EPIG_nearest <- MFA(na.omit(EPITOME[selPeaks_nearest,c(19:30)]), ncp=5, - group=c(6,1,1,2,2), type=c(rep("c",1),rep("n",4)), - name.group=c("EPIGENOME","LOC","CSkind","METH","DEG"), - num.group.sup=c(3)) -save(mfa.res_EPIG_nearest,file=fileOut)} - -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_EPIG_target.RData") -if (file.exists(fileOut)) {load(fileOut)} else { -mfa.res_EPIG_target <- MFA(na.omit(EPITOME[selPeaks_target,c(19:30)]), ncp=5, - group=c(6,1,1,2,2), type=c(rep("c",1),rep("n",4)), - name.group=c("EPIGENOME","LOC","CSkind","METH","DEG"), - num.group.sup=c(3)) -save(mfa.res_EPIG_target,file=fileOut)} - -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOME_nearest.RData") -if (file.exists(fileOut)) {load(fileOut)} else { -mfa.res_TOME_nearest <- MFA(na.omit(EPITOME[selPeaks_nearest,c(7:18,25:30)]), ncp=5, - group=c(12,1,1,2,2), type=c(rep("c",1),rep("n",4)), - name.group=c("TOME","LOC","CSkind","DM","DEG"), - num.group.sup=c(5)) -save(mfa.res_TOME_nearest,file=fileOut)} - -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOME_target.RData") -if (file.exists(fileOut)) {load(fileOut)} else { -mfa.res_TOME_target <- MFA(na.omit(EPITOME[selPeaks_target,c(7:18,25:30)]), ncp=5, - group=c(12,1,1,2,2), type=c(rep("c",1),rep("n",4)), - name.group=c("TOME","LOC","CSkind","DM","DEG"), - num.group.sup=c(5)) -save(mfa.res_TOME_target,file=fileOut)} - -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_TOMEonly.RData") -if (file.exists(fileOut)) {load(fileOut)} else { -tmp=na.omit(EPITOME[selPeaks_nearest,c(7:18,29:30)]) -tmp=subset(tmp,tmp$DEG_sign!="DownUp" & tmp$DEG_sign!="UpDown") -mfa.res_TOMEonly <- MFA(tmp, ncp=5, - group=c(12,2), type=c(rep("c",1),rep("n",1)), - name.group=c("TOME","DEG")) -save(mfa.res_TOMEonly,file=fileOut)} - -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_METHOME_nearest.RData") -if (file.exists(fileOut)) {load(fileOut)} else { -mfa.res_METHOME_nearest <- MFA(na.omit(EPITOME[selPeaks_nearest,c(1:4,25:30)]), ncp=5, - group=c(4,1,1,2,2), type=c(rep("c",1),rep("n",4)), - name.group=c("METHOME","LOC","CSkind","DM","DEG"), - num.group.sup=c(4)) -save(mfa.res_METHOME_nearest,file=fileOut)} - -fileOut=paste0(WDIR,"/results_project12891/scripts/Robjects/mfa.res_METHOME_target.RData") -if (file.exists(fileOut)) {load(fileOut)} else { -mfa.res_METHOME_target <- MFA(na.omit(EPITOME[selPeaks_target,c(1:4,25:30)]), ncp=5, - group=c(4,1,1,2,2), type=c(rep("c",1),rep("n",4)), - name.group=c("METHOME","LOC","CSkind","DM","DEG"), - num.group.sup=c(4)) -save(mfa.res_METHOME_target,file=fileOut)} - - -mfa.res=mfa.res_TOMEMETH_EPIG_nearest -mfa.res=mfa.res_EPIG_nearest -mfa.res=mfa.res_EPIGMETH_nearest -mfa.res=mfa.res_TOME_nearest -mfa.res=mfa.res_TOMEonly ; nClust=8 -mfa.res=mfa.res_METHOME_nearest - -MFAstats(mfa.res) - - -#### Clustering - -hcpc.res <- HCPC(mfa.res, nb.clust=nClust, conso=0, min=3, max=10, - metric = "euclidean") - -clusterStats(hcpc.res) - -# d=dimdesc(mfa.res) -# -# #### Confidence ellipses around categories per variable -# plotellipses(mfa.res,axes = c(2,3), means=FALSE) -# plotellipses(mfa.res,axes = c(2,3), -# keepvar="DEG_kind",label = "none") ## for 1 variable - -``` - -```{r} -plot <- function(expected,obs,nClust) { - ggplot_df=rbind(as.data.frame(expected),as.data.frame(obs)) - ggplot_df$Set=c(rep("All",nClust),rep("Subset",nClust)) - colnames(ggplot_df)[1:2]=c("MFAcluster","Freq") - - p <- ggplot(data=ggplot_df, aes(x=Set, y=Freq, fill=MFAcluster)) + - geom_bar(stat="identity")+ - scale_fill_brewer(palette="Paired")+ - theme_minimal() + ggtitle(selectedGS) - print(p) - -} - -``` - diff --git a/Wednesday/MFA/RData/AnnotationRows.RData b/Wednesday/MFA/RData/AnnotationRows.RData new file mode 100644 index 0000000000000000000000000000000000000000..cb60233e617b42b24bbd0546cb70ae9528ed5245 Binary files /dev/null and b/Wednesday/MFA/RData/AnnotationRows.RData differ diff --git a/Wednesday/MFA/RData/DEGs.RData b/Wednesday/MFA/RData/DEGs.RData new file mode 100644 index 0000000000000000000000000000000000000000..c7a361070a07cf871b67e4039b7acbe08309e5e6 Binary files /dev/null and b/Wednesday/MFA/RData/DEGs.RData differ diff --git a/Wednesday/MFA/RData/DE_results_Annot.RData b/Wednesday/MFA/RData/DE_results_Annot.RData new file mode 100644 index 0000000000000000000000000000000000000000..f5e986e96e3b8164310994061a25f2d0bd64cf70 Binary files /dev/null and b/Wednesday/MFA/RData/DE_results_Annot.RData differ diff --git a/Wednesday/MFA/RData/DM_results_H3K4me2.RData b/Wednesday/MFA/RData/DM_results_H3K4me2.RData new file mode 100644 index 0000000000000000000000000000000000000000..151bb398221a7e5f40a079075185432fef1f4924 Binary files /dev/null and b/Wednesday/MFA/RData/DM_results_H3K4me2.RData differ diff --git a/Wednesday/MFA/RData/DMpeaks_H3K4me2.RData b/Wednesday/MFA/RData/DMpeaks_H3K4me2.RData new file mode 100644 index 0000000000000000000000000000000000000000..ff7c6adf0ac33fd6f20bf70c306bb4b22465eee4 Binary files /dev/null and b/Wednesday/MFA/RData/DMpeaks_H3K4me2.RData differ diff --git a/Wednesday/MFA/RData/H3K4me2_allPeaks_CS.RData b/Wednesday/MFA/RData/H3K4me2_allPeaks_CS.RData new file mode 100644 index 0000000000000000000000000000000000000000..e09d7df23eca1cab0c9540566c7cdc06a290d518 Binary files /dev/null and b/Wednesday/MFA/RData/H3K4me2_allPeaks_CS.RData differ diff --git a/Wednesday/MFA/RData/TOME_RMAandNoBatch.RData b/Wednesday/MFA/RData/TOME_RMAandNoBatch.RData new file mode 100644 index 0000000000000000000000000000000000000000..861f3d7d959f0faff01818727cebaeb7fd89b2da Binary files /dev/null and b/Wednesday/MFA/RData/TOME_RMAandNoBatch.RData differ diff --git a/Wednesday/MFA/RData/countNormLength_H3K4me2.RData b/Wednesday/MFA/RData/countNormLength_H3K4me2.RData new file mode 100644 index 0000000000000000000000000000000000000000..b7befa2d82a48aaa80dc851703e96a581d0019e6 Binary files /dev/null and b/Wednesday/MFA/RData/countNormLength_H3K4me2.RData differ diff --git a/Wednesday/MFA/RData/mfa.EPITOME.RData b/Wednesday/MFA/RData/mfa.EPITOME.RData new file mode 100644 index 0000000000000000000000000000000000000000..f2c22b1fdd16878897898f124f22a37503771c29 Binary files /dev/null and b/Wednesday/MFA/RData/mfa.EPITOME.RData differ