From 56d5a95e949773f82ba0e7c6c75aee80ba40bd1c Mon Sep 17 00:00:00 2001 From: Hugo VARET <hugo.varet@pasteur.fr> Date: Wed, 2 Mar 2022 11:45:24 +0100 Subject: [PATCH] modif_report --- DESCRIPTION | 6 +++--- R/MAPlot.R | 3 ++- R/PCAPlot.R | 7 ++++--- R/barplotTotal.R | 1 + R/clusterPlot.R | 1 + R/countsBoxplots.R | 6 ++++-- R/densityPlot.R | 31 ------------------------------- R/dispersionsPlot.R | 3 ++- R/pairwiseScatterPlots.R | 9 +++++---- R/rawpHist.R | 3 ++- inst/Report_ChIPflowR.Rmd | 16 +++++----------- man/densityPlot.Rd | 32 -------------------------------- 12 files changed, 29 insertions(+), 89 deletions(-) delete mode 100755 R/densityPlot.R delete mode 100644 man/densityPlot.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 29f0045..c193629 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ChIPflowR Type: Package Title: Differential analysis of ChIP-Seq data -Version: 0.99.0 -Date: 2020-05-07 +Version: 0.99.1 +Date: 2022-03-02 Author: Marie-Agnes Dillies, Hugo Varet and Maëlle Daunesse Maintainer: Maëlle Daunesse <maelle.daunesse@pasteur.fr> Depends: R (>= 3.4.0), DESeq2, limma, kableExtra, knitr, ggplot2 (>= 3.3.0) @@ -15,4 +15,4 @@ Description: Provide R tools and an environment for the differential perform statistical analysis/testing with DESeq2 or limma, export results and create final report. License: GPL-2 -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.2 diff --git a/R/MAPlot.R b/R/MAPlot.R index 45de871..a2f8619 100755 --- a/R/MAPlot.R +++ b/R/MAPlot.R @@ -35,7 +35,8 @@ MAPlot <- function(results, method, alpha=0.05, outfile=FALSE, log2FClim=NULL){ scale_fill_manual(values=c("no"="black", "yes"="red"), drop=FALSE) + scale_y_continuous(expand=expansion(mult=c(0.03, 0.03))) + ylab(expression(log[2]~fold~change)) + - ggtitle(paste0("MA-plot - ", gsub("_"," ",name))) + ggtitle(paste0("MA-plot - ", gsub("_"," ",name))) + + theme_light() if (method=="DESeq2"){ p[[name]] <- p[[name]] + scale_x_continuous(trans = log10_trans(), diff --git a/R/PCAPlot.R b/R/PCAPlot.R index 7cc9ea0..f1050ce 100755 --- a/R/PCAPlot.R +++ b/R/PCAPlot.R @@ -28,12 +28,13 @@ PCAPlot <- function(counts.trans, conditions, n=min(500, nrow(counts.trans)), d <- data.frame(x=pca$x[,index1], y=pca$x[,index2], conditions=conditions, sample=factor(rownames(pca$x), levels=rownames(pca$x))) ggplot(data=d, aes(x=.data$x, y=.data$y, color=conditions, label=sample)) + - geom_point(show.legend=TRUE, size=3) + + geom_point(show.legend=FALSE, size=2) + labs(color="") + scale_colour_manual(values=col) + - geom_text_repel(show.legend=FALSE, size=5, point.padding=0.2) + + geom_text_repel(show.legend=FALSE, size=4, point.padding=0.2) + xlab(paste0("PC", index1, " (",prp[index1],"%)")) + - ylab(paste0("PC", index2, " (",prp[index2],"%)")) + ylab(paste0("PC", index2, " (",prp[index2],"%)")) + + theme_light() } p1 <- tmpFunction(c(1, 2)) p2 <- tmpFunction(c(1, 3)) diff --git a/R/barplotTotal.R b/R/barplotTotal.R index afa03a5..6d319d0 100755 --- a/R/barplotTotal.R +++ b/R/barplotTotal.R @@ -23,6 +23,7 @@ barplotTotal <- function(counts, conditions, ylab("Total read count (million)") + scale_y_continuous(expand=expansion(mult=c(0.01, 0.05))) + ggtitle("Total read count per sample (million)") + + theme_light() + theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5))) if (outfile) dev.off() } diff --git a/R/clusterPlot.R b/R/clusterPlot.R index 8ec9b12..f7eb793 100755 --- a/R/clusterPlot.R +++ b/R/clusterPlot.R @@ -16,6 +16,7 @@ clusterPlot <- function(counts.trans, conditions, outfile=FALSE){ xlab("Samples") + ylab("Height") + ggtitle("Cluster dendrogram\nEuclidean distance, Ward criterion") + + theme_light() + theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5), axis.text.y=element_text(angle=0)) + scale_y_continuous(expand=expansion(mult=c(0.01, 0.05)))) diff --git a/R/countsBoxplots.R b/R/countsBoxplots.R index 9a8b14b..a6d19dd 100755 --- a/R/countsBoxplots.R +++ b/R/countsBoxplots.R @@ -28,24 +28,26 @@ countsBoxplots <- function(results, conditions, method, d <- stack(as.data.frame(log2(as.matrix(counts)))) d$conditions <- rep(conditions, each=nrow(counts)) p1 <- ggplot(d) + - geom_boxplot(aes(x=.data$ind, y=.data$values+1, fill=.data$conditions), show.legend=TRUE) + + geom_boxplot(aes(x=.data$ind, y=.data$values+1, fill=.data$conditions), show.legend=FALSE) + labs(fill="") + scale_fill_manual(values=col) + xlab("Samples") + ylab("Raw counts (log scale)") + ggtitle("Raw counts distribution") + + theme_light() + theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5)) # norm counts if (method=="DESeq2") norm.counts <- log2(as.matrix(norm.counts)) d <- stack(as.data.frame(norm.counts)) d$conditions <- rep(conditions, each=nrow(norm.counts)) p2 <- ggplot(d) + - geom_boxplot(aes(x=.data$ind, y=.data$values+1, fill=.data$conditions), show.legend=TRUE) + + geom_boxplot(aes(x=.data$ind, y=.data$values+1, fill=.data$conditions), show.legend=FALSE) + labs(fill="") + scale_fill_manual(values=col) + xlab("Samples") + ylab("Normalized counts (log scale)") + ggtitle("Normalized counts distribution") + + theme_light() + theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5)) grid.arrange(p1, p2, nrow=1, ncol=2) if (outfile) dev.off() diff --git a/R/densityPlot.R b/R/densityPlot.R deleted file mode 100755 index e4ad710..0000000 --- a/R/densityPlot.R +++ /dev/null @@ -1,31 +0,0 @@ -#' Density plot of all samples -#' -#' Estimation the counts density for each sample -#' -#' @name densityPlot -#' @param counts \code{matrix} of counts -#' @param conditions factor vector of the condition from which each sample belongs -#' @param col colors of the curves (one per biological condition) -#' @param outfile TRUE to export the figure in a png file -#' @return A density plot for each sample -#' @author Marie-Agnes Dillies, Hugo Varet and Maëlle Daunesse - -densityPlot <- function(counts, conditions, - col=c("#f3c300", "#875692", "#f38400", "#a1caf1", "#be0032", "#c2b280", "#848482", "#008856", "#e68fac", "#0067a5", "#f99379", "#604e97"), - outfile=TRUE){ - if (outfile) png(filename="Densityplot_DESeq2.png",width=1800,height=1800,res=300) - counts <- removeNull(counts) - d <- stack(data.frame(counts)) - d$conditions <- rep(conditions, each=nrow(counts)) - print(ggplot(d, aes(x=.data$values+1)) + - stat_density(aes(group=.data$ind, color=.data$conditions), position="identity", geom="line", show.legend=TRUE) + - scale_x_continuous(trans = log10_trans(), - breaks = trans_breaks("log10", function(x) 10^x), - labels = trans_format("log10", math_format(~10^.x))) + - labs(color="") + - scale_colour_manual(values=col) + - xlab("Raw counts") + - ylab("Density") + - ggtitle("Density of counts distribution")) - if (outfile) dev.off() -} diff --git a/R/dispersionsPlot.R b/R/dispersionsPlot.R index e1b37a0..a2ef6ce 100755 --- a/R/dispersionsPlot.R +++ b/R/dispersionsPlot.R @@ -32,7 +32,8 @@ dispersionsPlot <- function(dds, outfile=FALSE){ labels=c("Estimate", "Final", "Fit"), name="") + guides(colour = guide_legend(override.aes = list(size=2))) + - ggtitle("Dispersions") + ggtitle("Dispersions") + + theme_light() print(p) if (outfile) dev.off() } diff --git a/R/pairwiseScatterPlots.R b/R/pairwiseScatterPlots.R index f5cff42..0439290 100755 --- a/R/pairwiseScatterPlots.R +++ b/R/pairwiseScatterPlots.R @@ -28,15 +28,16 @@ pairwiseScatterPlots <- function(counts, outfile=FALSE){ scale_y_continuous(trans = log10_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(~10^.x))) + - xlab(names(d)[i]) + ylab(names(d)[j]) + - geom_abline(slope=1, intercept=0, linetype="dashed", col="lightgrey") + xlab(names(d)[i]) + + ylab(names(d)[j]) + + geom_abline(slope=1, intercept=0, linetype="dashed", col="lightgrey") + + theme_light() } else{ text = as.character(round(SERE(counts[,c(i,j)]), digits=2)) p[[k]] <- ggplot() + annotate("text", x=4, y=25, size=6, label=text) + theme_bw() + xlab(names(d)[i]) + ylab(names(d)[j]) + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), - axis.text.x=element_blank(), axis.ticks.x=element_blank(), - axis.text.y=element_blank(), axis.ticks.y=element_blank()) + axis.text=element_blank(), axis.ticks=element_blank()) } layout_matrix[j,i] <- k k <- k+1 diff --git a/R/rawpHist.R b/R/rawpHist.R index 99a3b4e..8d7787c 100755 --- a/R/rawpHist.R +++ b/R/rawpHist.R @@ -20,7 +20,8 @@ rawpHist <- function(result, outfile=FALSE){ scale_y_continuous(expand=expansion(mult=c(0.01, 0.05))) + xlab("Raw p-value") + ylab("Frequency") + - ggtitle(paste0("Distribution of raw p-values - ", gsub("_"," ",name))) + ggtitle(paste0("Distribution of raw p-values - ", gsub("_"," ",name))) + + theme_light() } tmpfun <- function(...) grid.arrange(..., nrow=nrow, ncol=ncol) do.call(tmpfun, p) diff --git a/inst/Report_ChIPflowR.Rmd b/inst/Report_ChIPflowR.Rmd index 72f9a89..022d1e2 100644 --- a/inst/Report_ChIPflowR.Rmd +++ b/inst/Report_ChIPflowR.Rmd @@ -122,7 +122,6 @@ kable(apply(counts,2,fun_summary), caption = "Table 3: Summary of the raw counts Figure 1 shows the total number of mapped and counted reads for each sample. Total read counts are expected to be similar within conditions, they may be different across conditions. Total counts sometimes vary widely between replicates. ```{r barplot, echo=FALSE,fig.align="center",fig.cap="Figure 1: Number of mapped reads per sample. Colors refer to the biological condition of the sample.", out.width="600px"} -# Producing Barplot barplotTotal(counts = counts, conditions = Conditions) ``` @@ -132,8 +131,7 @@ A pairwise scatter plot is produced (figure 2) to show how replicates and sample - 1 for technical replicates (technical variability follows a Poisson distribution) - greater than 1 for biological replicates and samples from different biological conditions (biological variability is higher than technical one, data are over-dispersed with respect to Poisson). The higher the SERE value, the lower the similarity. It is expected to be lower between biological replicates than between samples of different biological conditions. Hence, the SERE statistic can be used to detect inversions between samples. -```{r pairewiseScatter, echo=FALSE,fig.align="center",fig.cap="Figure 2: Pairwise comparison of samples (not produced when more than 12 samples).", out.width="1200px"} -#PairwiseScatter +```{r pairewiseScatter, echo=FALSE,fig.align="center",fig.cap="Figure 2: Pairwise comparison of samples (not produced when more than 12 samples).", out.width="1200px", fig.width=3*ncol(counts), fig.height=2*ncol(counts)} pairwiseScatterPlots(counts = counts, outfile = FALSE) ``` @@ -144,14 +142,12 @@ The main variability within the experiment is expected to come from biological d Figure 3 sample clustering based on normalized data. An euclidean distance is computed between samples, and the dendrogram is built upon the Ward criterion. We expect this dendrogram to group replicates and separate biological conditions. ```{r clusterplot, echo=FALSE,fig.align="center",fig.cap="Figure 3: Sample clustering based on normalized data.", out.width="600px", warning=FALSE, message=FALSE} -# Cluster plot clusterPlot(counts.trans = counts.trans,conditions = Conditions, outfile = FALSE) ``` Another way of visualizing the experiment variability is to look at the first principal components of the PCA, as shown on the figure 4. On this figure, the first principal component (PC1) is expected to separate samples from the different biological conditions, meaning that the biological variability is the main source of variance in the data. -```{r PCA, echo=FALSE,fig.align="center",fig.cap="Figure 4: First three components of a Principal Component Analysis, with percentages of variance associated with each axis.", out.width="1200px"} -# PCA plot +```{r PCA, echo=FALSE,fig.align="center",fig.cap="Figure 4: First three components of a Principal Component Analysis, with percentages of variance associated with each axis.", fig.height=4, out.width="1200px"} PCAPlot(counts.trans = counts.trans, conditions = Conditions, outfile = FALSE) ``` @@ -189,7 +185,7 @@ kable(sf, caption = "Table X: Normalization factors",format = "html") %>% Boxplots are often used as a qualitative measure of the quality of the normalization process, as they show how distributions are globally affected during this process. We expect normalization to stabilize distributions across samples. Figure 5 shows boxplots of raw (left) and normalized (right) data respectively. -```{r boxplot, echo=FALSE,fig.align="center",fig.cap="Figure 5: Boxplots of raw (left) and normalized (right) read counts.", out.width="1200px", warning=FALSE} +```{r boxplot, echo=FALSE,fig.align="center",fig.cap="Figure 5: Boxplots of raw (left) and normalized (right) read counts.", out.width="1200px", fig.height=4, warning=FALSE} countsBoxplots(results = resAnDif$results, conditions = Conditions, method = method) ``` @@ -197,7 +193,7 @@ countsBoxplots(results = resAnDif$results, conditions = Conditions, method = met ## 5.1. Dispersions estimation -```{r dispersionPlot, echo=FALSE, fig.align="center", fig.cap="Figure 6: Dispersion estimation", out.width="1200px", results="asis"} +```{r dispersionPlot, echo=FALSE, fig.align="center", fig.cap="Figure 6: Dispersion estimation", out.width="600px", results="asis"} if (method=="DESeq2") { cat("The DESeq2 model assumes that the count data follow a negative binomial distribution which is a robust alternative to the Poisson law when data are over-dispersed (the variance is higher than the mean). The first step of the statistical procedure is to estimate the dispersion of the data. Its purpose is to determine the shape of the mean-variance relationship. The default is to apply a GLM (Generalized Linear Model) based method (fitType='parametric'), which can handle complex designs but may not converge in some cases.\n") @@ -208,7 +204,7 @@ cat("The figure 6 shows the result of the dispersion estimation step. The x- and } ``` -```{r meanvar, echo=FALSE, results="asis", fig.align="center", fig.cap="Figure 6: Mean-variance trend", out.width="1200px"} +```{r meanvar, echo=FALSE, results="asis", fig.align="center", fig.cap="Figure 6: Mean-variance trend", out.width="600px"} if (method=="Limma") { cat("For the differential marking/binding analysis we use the limma approach to RNA-seq [@ritchie2015]. Read counts are converted to log2-counts-per-million (logCPM) and the mean-variance relationship is modelled either with precision weights or with an empirical Bayes prior trend. Here we use the the precision weights approach called “voom” [@law2014]. This transformation permits to apply the linear modelling in the limma package can be applied to sequencing data. The systematic variability of the data is modeled with a linear approach to differentiate it from the random variability. This linear modeling is very similar to classical ANOVA or multiple regression, except that a model is adapted to each peak.") @@ -225,7 +221,6 @@ cat("The figure 6 shows the result of the variance estimation step. The x- and y Figure 7 shows the distributions of raw p-values computed by the statistical test for the comparison(s) done. This distribution is expected to be a mixture of a uniform distribution on $[0,1]$ and a peak around 0 corresponding to the differentially expressed features. ```{r, echo=FALSE,fig.align="center",fig.cap="Figure 7: Distribution(s) of raw p-values", out.width="600px"} -# Pvalue Hist rawpHist(result = resAnDif$results, outfile = FALSE) ``` @@ -251,7 +246,6 @@ kable(df, caption = "Table 4: Normalization factors",format = "html") %>% Figure 8 represents the MA-plot of the data for the comparisons done, where differentially expressed features are highlighted in red. A MA-plot represents the log ratio of differential expression as a function of the mean intensity for each feature. Triangles correspond to features having a too low/high $\log_2(\text{FC})$ to be displayed on the plot. ```{r MAplot, echo=FALSE,fig.align="center",fig.cap="Figure 8: MA-plot(s) of each comparison. Red dots represent significantly differentially expressed features.", out.width="600px"} -# Producing MAplots MAPlot(results = resAnDif$results, method = method, alpha = alpha, outfile = FALSE) ``` diff --git a/man/densityPlot.Rd b/man/densityPlot.Rd deleted file mode 100644 index e2ddbd2..0000000 --- a/man/densityPlot.Rd +++ /dev/null @@ -1,32 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/densityPlot.R -\name{densityPlot} -\alias{densityPlot} -\title{Density plot of all samples} -\usage{ -densityPlot( - counts, - conditions, - col = c("#f3c300", "#875692", "#f38400", "#a1caf1", "#be0032", "#c2b280", "#848482", - "#008856", "#e68fac", "#0067a5", "#f99379", "#604e97"), - outfile = TRUE -) -} -\arguments{ -\item{counts}{\code{matrix} of counts} - -\item{conditions}{factor vector of the condition from which each sample belongs} - -\item{col}{colors of the curves (one per biological condition)} - -\item{outfile}{TRUE to export the figure in a png file} -} -\value{ -A density plot for each sample -} -\description{ -Estimation the counts density for each sample -} -\author{ -Marie-Agnes Dillies, Hugo Varet and Maëlle Daunesse -} -- GitLab