diff --git a/DESCRIPTION b/DESCRIPTION index 29f0045d3fae1ca25c5b59ae80551b2bc8808072..c1936291bc6f3efda4cba7e2e1b5d9e86caeb635 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ChIPflowR Type: Package Title: Differential analysis of ChIP-Seq data -Version: 0.99.0 -Date: 2020-05-07 +Version: 0.99.1 +Date: 2022-03-02 Author: Marie-Agnes Dillies, Hugo Varet and Maëlle Daunesse Maintainer: Maëlle Daunesse <maelle.daunesse@pasteur.fr> Depends: R (>= 3.4.0), DESeq2, limma, kableExtra, knitr, ggplot2 (>= 3.3.0) @@ -15,4 +15,4 @@ Description: Provide R tools and an environment for the differential perform statistical analysis/testing with DESeq2 or limma, export results and create final report. License: GPL-2 -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.2 diff --git a/R/MAPlot.R b/R/MAPlot.R index 45de871021b839fc4aea048e9c7fe31bd5fa1013..a2f86198cf7f663a3815ebccf433da9d7accdb75 100755 --- a/R/MAPlot.R +++ b/R/MAPlot.R @@ -35,7 +35,8 @@ MAPlot <- function(results, method, alpha=0.05, outfile=FALSE, log2FClim=NULL){ scale_fill_manual(values=c("no"="black", "yes"="red"), drop=FALSE) + scale_y_continuous(expand=expansion(mult=c(0.03, 0.03))) + ylab(expression(log[2]~fold~change)) + - ggtitle(paste0("MA-plot - ", gsub("_"," ",name))) + ggtitle(paste0("MA-plot - ", gsub("_"," ",name))) + + theme_light() if (method=="DESeq2"){ p[[name]] <- p[[name]] + scale_x_continuous(trans = log10_trans(), diff --git a/R/PCAPlot.R b/R/PCAPlot.R index 7cc9ea0c89ac326be40c14bb2e3e1a5ef7f8652d..f1050ceca0c96f8a8cff627a969472ad60559bd6 100755 --- a/R/PCAPlot.R +++ b/R/PCAPlot.R @@ -28,12 +28,13 @@ PCAPlot <- function(counts.trans, conditions, n=min(500, nrow(counts.trans)), d <- data.frame(x=pca$x[,index1], y=pca$x[,index2], conditions=conditions, sample=factor(rownames(pca$x), levels=rownames(pca$x))) ggplot(data=d, aes(x=.data$x, y=.data$y, color=conditions, label=sample)) + - geom_point(show.legend=TRUE, size=3) + + geom_point(show.legend=FALSE, size=2) + labs(color="") + scale_colour_manual(values=col) + - geom_text_repel(show.legend=FALSE, size=5, point.padding=0.2) + + geom_text_repel(show.legend=FALSE, size=4, point.padding=0.2) + xlab(paste0("PC", index1, " (",prp[index1],"%)")) + - ylab(paste0("PC", index2, " (",prp[index2],"%)")) + ylab(paste0("PC", index2, " (",prp[index2],"%)")) + + theme_light() } p1 <- tmpFunction(c(1, 2)) p2 <- tmpFunction(c(1, 3)) diff --git a/R/barplotTotal.R b/R/barplotTotal.R index afa03a5d3b50671a04e4d45274e505c9899d7700..6d319d08ce11abce31153040ea03d940b30f5669 100755 --- a/R/barplotTotal.R +++ b/R/barplotTotal.R @@ -23,6 +23,7 @@ barplotTotal <- function(counts, conditions, ylab("Total read count (million)") + scale_y_continuous(expand=expansion(mult=c(0.01, 0.05))) + ggtitle("Total read count per sample (million)") + + theme_light() + theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5))) if (outfile) dev.off() } diff --git a/R/clusterPlot.R b/R/clusterPlot.R index 8ec9b12f778f0fd1134f943aa1ce70682997a022..f7eb793b1156b79d72258efbe1bef383f3d61b06 100755 --- a/R/clusterPlot.R +++ b/R/clusterPlot.R @@ -16,6 +16,7 @@ clusterPlot <- function(counts.trans, conditions, outfile=FALSE){ xlab("Samples") + ylab("Height") + ggtitle("Cluster dendrogram\nEuclidean distance, Ward criterion") + + theme_light() + theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5), axis.text.y=element_text(angle=0)) + scale_y_continuous(expand=expansion(mult=c(0.01, 0.05)))) diff --git a/R/countsBoxplots.R b/R/countsBoxplots.R index 9a8b14bb326334c4c3b4a1c667b68ce7ac259881..a6d19dd6191cee7b8e0409d39907a455a17f4a5a 100755 --- a/R/countsBoxplots.R +++ b/R/countsBoxplots.R @@ -28,24 +28,26 @@ countsBoxplots <- function(results, conditions, method, d <- stack(as.data.frame(log2(as.matrix(counts)))) d$conditions <- rep(conditions, each=nrow(counts)) p1 <- ggplot(d) + - geom_boxplot(aes(x=.data$ind, y=.data$values+1, fill=.data$conditions), show.legend=TRUE) + + geom_boxplot(aes(x=.data$ind, y=.data$values+1, fill=.data$conditions), show.legend=FALSE) + labs(fill="") + scale_fill_manual(values=col) + xlab("Samples") + ylab("Raw counts (log scale)") + ggtitle("Raw counts distribution") + + theme_light() + theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5)) # norm counts if (method=="DESeq2") norm.counts <- log2(as.matrix(norm.counts)) d <- stack(as.data.frame(norm.counts)) d$conditions <- rep(conditions, each=nrow(norm.counts)) p2 <- ggplot(d) + - geom_boxplot(aes(x=.data$ind, y=.data$values+1, fill=.data$conditions), show.legend=TRUE) + + geom_boxplot(aes(x=.data$ind, y=.data$values+1, fill=.data$conditions), show.legend=FALSE) + labs(fill="") + scale_fill_manual(values=col) + xlab("Samples") + ylab("Normalized counts (log scale)") + ggtitle("Normalized counts distribution") + + theme_light() + theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5)) grid.arrange(p1, p2, nrow=1, ncol=2) if (outfile) dev.off() diff --git a/R/densityPlot.R b/R/densityPlot.R deleted file mode 100755 index e4ad71013fc0c5f8ddbd0786ca3769d6f4f5d28e..0000000000000000000000000000000000000000 --- a/R/densityPlot.R +++ /dev/null @@ -1,31 +0,0 @@ -#' Density plot of all samples -#' -#' Estimation the counts density for each sample -#' -#' @name densityPlot -#' @param counts \code{matrix} of counts -#' @param conditions factor vector of the condition from which each sample belongs -#' @param col colors of the curves (one per biological condition) -#' @param outfile TRUE to export the figure in a png file -#' @return A density plot for each sample -#' @author Marie-Agnes Dillies, Hugo Varet and Maëlle Daunesse - -densityPlot <- function(counts, conditions, - col=c("#f3c300", "#875692", "#f38400", "#a1caf1", "#be0032", "#c2b280", "#848482", "#008856", "#e68fac", "#0067a5", "#f99379", "#604e97"), - outfile=TRUE){ - if (outfile) png(filename="Densityplot_DESeq2.png",width=1800,height=1800,res=300) - counts <- removeNull(counts) - d <- stack(data.frame(counts)) - d$conditions <- rep(conditions, each=nrow(counts)) - print(ggplot(d, aes(x=.data$values+1)) + - stat_density(aes(group=.data$ind, color=.data$conditions), position="identity", geom="line", show.legend=TRUE) + - scale_x_continuous(trans = log10_trans(), - breaks = trans_breaks("log10", function(x) 10^x), - labels = trans_format("log10", math_format(~10^.x))) + - labs(color="") + - scale_colour_manual(values=col) + - xlab("Raw counts") + - ylab("Density") + - ggtitle("Density of counts distribution")) - if (outfile) dev.off() -} diff --git a/R/dispersionsPlot.R b/R/dispersionsPlot.R index e1b37a0974dfdf41bc859431aab0233c4357bb0f..a2ef6ce36a0de0d7a877c2b0a5829080fa30e3d2 100755 --- a/R/dispersionsPlot.R +++ b/R/dispersionsPlot.R @@ -32,7 +32,8 @@ dispersionsPlot <- function(dds, outfile=FALSE){ labels=c("Estimate", "Final", "Fit"), name="") + guides(colour = guide_legend(override.aes = list(size=2))) + - ggtitle("Dispersions") + ggtitle("Dispersions") + + theme_light() print(p) if (outfile) dev.off() } diff --git a/R/pairwiseScatterPlots.R b/R/pairwiseScatterPlots.R index f5cff420d36cdb1c1f035ed2a160a23620cf1761..0439290cf484a8c2af45ea6143e93f986c610fcf 100755 --- a/R/pairwiseScatterPlots.R +++ b/R/pairwiseScatterPlots.R @@ -28,15 +28,16 @@ pairwiseScatterPlots <- function(counts, outfile=FALSE){ scale_y_continuous(trans = log10_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(~10^.x))) + - xlab(names(d)[i]) + ylab(names(d)[j]) + - geom_abline(slope=1, intercept=0, linetype="dashed", col="lightgrey") + xlab(names(d)[i]) + + ylab(names(d)[j]) + + geom_abline(slope=1, intercept=0, linetype="dashed", col="lightgrey") + + theme_light() } else{ text = as.character(round(SERE(counts[,c(i,j)]), digits=2)) p[[k]] <- ggplot() + annotate("text", x=4, y=25, size=6, label=text) + theme_bw() + xlab(names(d)[i]) + ylab(names(d)[j]) + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(), - axis.text.x=element_blank(), axis.ticks.x=element_blank(), - axis.text.y=element_blank(), axis.ticks.y=element_blank()) + axis.text=element_blank(), axis.ticks=element_blank()) } layout_matrix[j,i] <- k k <- k+1 diff --git a/R/rawpHist.R b/R/rawpHist.R index 99a3b4e84c40aaf6b21be135032b2d61bf71896d..8d7787c157ac0cc73df059817a866014a94bc5fa 100755 --- a/R/rawpHist.R +++ b/R/rawpHist.R @@ -20,7 +20,8 @@ rawpHist <- function(result, outfile=FALSE){ scale_y_continuous(expand=expansion(mult=c(0.01, 0.05))) + xlab("Raw p-value") + ylab("Frequency") + - ggtitle(paste0("Distribution of raw p-values - ", gsub("_"," ",name))) + ggtitle(paste0("Distribution of raw p-values - ", gsub("_"," ",name))) + + theme_light() } tmpfun <- function(...) grid.arrange(..., nrow=nrow, ncol=ncol) do.call(tmpfun, p) diff --git a/inst/Report_ChIPflowR.Rmd b/inst/Report_ChIPflowR.Rmd index 72f9a899e7034f234a460b169bbd3411e6d8f4f5..022d1e2e6f3153ef1791f46393226726be1986ed 100644 --- a/inst/Report_ChIPflowR.Rmd +++ b/inst/Report_ChIPflowR.Rmd @@ -122,7 +122,6 @@ kable(apply(counts,2,fun_summary), caption = "Table 3: Summary of the raw counts Figure 1 shows the total number of mapped and counted reads for each sample. Total read counts are expected to be similar within conditions, they may be different across conditions. Total counts sometimes vary widely between replicates. ```{r barplot, echo=FALSE,fig.align="center",fig.cap="Figure 1: Number of mapped reads per sample. Colors refer to the biological condition of the sample.", out.width="600px"} -# Producing Barplot barplotTotal(counts = counts, conditions = Conditions) ``` @@ -132,8 +131,7 @@ A pairwise scatter plot is produced (figure 2) to show how replicates and sample - 1 for technical replicates (technical variability follows a Poisson distribution) - greater than 1 for biological replicates and samples from different biological conditions (biological variability is higher than technical one, data are over-dispersed with respect to Poisson). The higher the SERE value, the lower the similarity. It is expected to be lower between biological replicates than between samples of different biological conditions. Hence, the SERE statistic can be used to detect inversions between samples. -```{r pairewiseScatter, echo=FALSE,fig.align="center",fig.cap="Figure 2: Pairwise comparison of samples (not produced when more than 12 samples).", out.width="1200px"} -#PairwiseScatter +```{r pairewiseScatter, echo=FALSE,fig.align="center",fig.cap="Figure 2: Pairwise comparison of samples (not produced when more than 12 samples).", out.width="1200px", fig.width=3*ncol(counts), fig.height=2*ncol(counts)} pairwiseScatterPlots(counts = counts, outfile = FALSE) ``` @@ -144,14 +142,12 @@ The main variability within the experiment is expected to come from biological d Figure 3 sample clustering based on normalized data. An euclidean distance is computed between samples, and the dendrogram is built upon the Ward criterion. We expect this dendrogram to group replicates and separate biological conditions. ```{r clusterplot, echo=FALSE,fig.align="center",fig.cap="Figure 3: Sample clustering based on normalized data.", out.width="600px", warning=FALSE, message=FALSE} -# Cluster plot clusterPlot(counts.trans = counts.trans,conditions = Conditions, outfile = FALSE) ``` Another way of visualizing the experiment variability is to look at the first principal components of the PCA, as shown on the figure 4. On this figure, the first principal component (PC1) is expected to separate samples from the different biological conditions, meaning that the biological variability is the main source of variance in the data. -```{r PCA, echo=FALSE,fig.align="center",fig.cap="Figure 4: First three components of a Principal Component Analysis, with percentages of variance associated with each axis.", out.width="1200px"} -# PCA plot +```{r PCA, echo=FALSE,fig.align="center",fig.cap="Figure 4: First three components of a Principal Component Analysis, with percentages of variance associated with each axis.", fig.height=4, out.width="1200px"} PCAPlot(counts.trans = counts.trans, conditions = Conditions, outfile = FALSE) ``` @@ -189,7 +185,7 @@ kable(sf, caption = "Table X: Normalization factors",format = "html") %>% Boxplots are often used as a qualitative measure of the quality of the normalization process, as they show how distributions are globally affected during this process. We expect normalization to stabilize distributions across samples. Figure 5 shows boxplots of raw (left) and normalized (right) data respectively. -```{r boxplot, echo=FALSE,fig.align="center",fig.cap="Figure 5: Boxplots of raw (left) and normalized (right) read counts.", out.width="1200px", warning=FALSE} +```{r boxplot, echo=FALSE,fig.align="center",fig.cap="Figure 5: Boxplots of raw (left) and normalized (right) read counts.", out.width="1200px", fig.height=4, warning=FALSE} countsBoxplots(results = resAnDif$results, conditions = Conditions, method = method) ``` @@ -197,7 +193,7 @@ countsBoxplots(results = resAnDif$results, conditions = Conditions, method = met ## 5.1. Dispersions estimation -```{r dispersionPlot, echo=FALSE, fig.align="center", fig.cap="Figure 6: Dispersion estimation", out.width="1200px", results="asis"} +```{r dispersionPlot, echo=FALSE, fig.align="center", fig.cap="Figure 6: Dispersion estimation", out.width="600px", results="asis"} if (method=="DESeq2") { cat("The DESeq2 model assumes that the count data follow a negative binomial distribution which is a robust alternative to the Poisson law when data are over-dispersed (the variance is higher than the mean). The first step of the statistical procedure is to estimate the dispersion of the data. Its purpose is to determine the shape of the mean-variance relationship. The default is to apply a GLM (Generalized Linear Model) based method (fitType='parametric'), which can handle complex designs but may not converge in some cases.\n") @@ -208,7 +204,7 @@ cat("The figure 6 shows the result of the dispersion estimation step. The x- and } ``` -```{r meanvar, echo=FALSE, results="asis", fig.align="center", fig.cap="Figure 6: Mean-variance trend", out.width="1200px"} +```{r meanvar, echo=FALSE, results="asis", fig.align="center", fig.cap="Figure 6: Mean-variance trend", out.width="600px"} if (method=="Limma") { cat("For the differential marking/binding analysis we use the limma approach to RNA-seq [@ritchie2015]. Read counts are converted to log2-counts-per-million (logCPM) and the mean-variance relationship is modelled either with precision weights or with an empirical Bayes prior trend. Here we use the the precision weights approach called “voom” [@law2014]. This transformation permits to apply the linear modelling in the limma package can be applied to sequencing data. The systematic variability of the data is modeled with a linear approach to differentiate it from the random variability. This linear modeling is very similar to classical ANOVA or multiple regression, except that a model is adapted to each peak.") @@ -225,7 +221,6 @@ cat("The figure 6 shows the result of the variance estimation step. The x- and y Figure 7 shows the distributions of raw p-values computed by the statistical test for the comparison(s) done. This distribution is expected to be a mixture of a uniform distribution on $[0,1]$ and a peak around 0 corresponding to the differentially expressed features. ```{r, echo=FALSE,fig.align="center",fig.cap="Figure 7: Distribution(s) of raw p-values", out.width="600px"} -# Pvalue Hist rawpHist(result = resAnDif$results, outfile = FALSE) ``` @@ -251,7 +246,6 @@ kable(df, caption = "Table 4: Normalization factors",format = "html") %>% Figure 8 represents the MA-plot of the data for the comparisons done, where differentially expressed features are highlighted in red. A MA-plot represents the log ratio of differential expression as a function of the mean intensity for each feature. Triangles correspond to features having a too low/high $\log_2(\text{FC})$ to be displayed on the plot. ```{r MAplot, echo=FALSE,fig.align="center",fig.cap="Figure 8: MA-plot(s) of each comparison. Red dots represent significantly differentially expressed features.", out.width="600px"} -# Producing MAplots MAPlot(results = resAnDif$results, method = method, alpha = alpha, outfile = FALSE) ``` diff --git a/man/densityPlot.Rd b/man/densityPlot.Rd deleted file mode 100644 index e2ddbd2b71dc7de83fa51c8e21e6b8d2c5ccbb67..0000000000000000000000000000000000000000 --- a/man/densityPlot.Rd +++ /dev/null @@ -1,32 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/densityPlot.R -\name{densityPlot} -\alias{densityPlot} -\title{Density plot of all samples} -\usage{ -densityPlot( - counts, - conditions, - col = c("#f3c300", "#875692", "#f38400", "#a1caf1", "#be0032", "#c2b280", "#848482", - "#008856", "#e68fac", "#0067a5", "#f99379", "#604e97"), - outfile = TRUE -) -} -\arguments{ -\item{counts}{\code{matrix} of counts} - -\item{conditions}{factor vector of the condition from which each sample belongs} - -\item{col}{colors of the curves (one per biological condition)} - -\item{outfile}{TRUE to export the figure in a png file} -} -\value{ -A density plot for each sample -} -\description{ -Estimation the counts density for each sample -} -\author{ -Marie-Agnes Dillies, Hugo Varet and Maëlle Daunesse -}