diff --git a/README.md b/README.md index f6c616275e1cc1324127ddcc73c2ecb0f03646d5..f15e3a1cb032ad0c476ca95acee447827b9f2669 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,11 @@ In addition to above tools, you need to install pipeline-related tools: - samtools - bowtie2 - bedtools -- R - +- R (>= 4.0.2) + - ggplot2 + - tidyverse + - gggenes + - rtracklayer ## How to run RNAsig ? diff --git a/Snakefile b/Snakefile index 74960f89deb8e4352a7fe28217d61a0d796b5cf2..e1e05510a7020ef78c3d0229e99a32fb11ee2625 100644 --- a/Snakefile +++ b/Snakefile @@ -46,7 +46,6 @@ receptor = [ x.strip() for x in (config["design"]["receptor"]).split(",")] conds = [ x.strip() for x in (config["design"]["condition"]).split(",")] - #------------------------------------------------------- # paired-end data gestion read_tag = config["input_readtag"] @@ -124,18 +123,17 @@ else: ## Indexing genome - ref = [ config["genome"]["name"] ] if config["genome"]["host_mapping"]: - ref += config["genome"]["host_name"] + ref += [ config["genome"]["host_name"] ] def mapping_index(wildcards): if (wildcards.REF == config["genome"]["name"]): input = config["genome"]["fasta_file"] - elif (wildcards.REF == "spikes"): + elif (wildcards.REF == config["genome"]["host_name"]): input = config["genome"]["host_fasta_file"] return(input) @@ -185,20 +183,19 @@ include: os.path.join(RULES, "genomecov.rules") # Multiqc rule -if config['multiqc']['config_file']: - config['multiqc']['options'] += " -c " + config['multiqc']['config_file'] +config['multiqc']['options'] += " -c config/multiqc_config.yaml" __multiqc__input = expected_output __multiqc__input_dir = "." __multiqc__logs = "04-Multiqc/multiqc.log" -__multiqc__output = config['multiqc']['output-directory'] +__multiqc__output = "04-Multiqc" __multiqc__options = config['multiqc']['options'] -__multiqc__output_dir = config['multiqc']['output-directory'] +__multiqc__output_dir = "04-Multiqc" expected_output = [__multiqc__output] include: os.path.join(RULES, "multiqc.rules") -rule chipuana: +rule rnasig: input: expected_output diff --git a/config/config.yaml b/config/config.yaml index 3915aafbf6a1bbbe86f80beb10eb6fafcdb9a0f1..def53a0c4c07293ac1942e82086ef0ae394a5874 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -117,8 +117,6 @@ adapters: quality: 30 threads: 4 - - #=============================================================================== # bowtie2_mapping used to align reads against genome file # @@ -133,10 +131,6 @@ bowtie2_mapping: options: "--very-sensitive " threads: 4 - - - - #=============================================================================== # MultiQC aggregates results from bioinformatics analyses across many # samples into a single report. @@ -144,12 +138,8 @@ bowtie2_mapping: # :Parameters: # # - options: any options recognised by multiqc -# - output-directory: Create report in the specified output directory -# - config_file: path to config file in yaml #=============================================================================== multiqc: options: " -f " - output-directory: "04-Multiqc" - config_file: "multiqc_config.yaml" diff --git a/workflow/.gitkeep b/workflow/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/workflow/rules/.gitkeep b/workflow/rules/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/workflow/rules/adapters.rules b/workflow/rules/adapters.rules new file mode 100755 index 0000000000000000000000000000000000000000..89c5e051aa929be69e2ac3c46ffc971158b997e2 --- /dev/null +++ b/workflow/rules/adapters.rules @@ -0,0 +1,66 @@ +######################################################################### +# RNAsig: an automated pipeline to detect RNA signatures on viruses # +# # +# Authors: Rachel Legendre # +# Copyright (c) 2020-2021 Institut Pasteur (Paris). # +# # +# This file is part of RNAsig workflow. # +# # +# RNAsig is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation, either version 3 of the License, or # +# (at your option) any later version. # +# # +# RNAsig is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details . # +# # +# You should have received a copy of the GNU General Public License # +# along with RNAsig (LICENSE). # +# If not, see <https://www.gnu.org/licenses/>. # +######################################################################### + + +rule adapters: + input: + fastq = __adapters__input_fastq + output: + __adapters__output + params: + wkdir = __adapters__wkdir, + options = __adapters__options, + adapters = __adapters__adapt_list, + mode = __adapters__mode, + min = __adapters_min, + qual = __adapters_qual + singularity: + "chipuana.img" + threads: config['adapters']['threads'] if 'threads' in config['adapters'] else 4 + log: + __adapters__log + shell: + """ + set +o pipefail + + tmp="{input}" + infiles=($tmp) + + tmp="{output}" + outfiles=($tmp) + + + cmd="cutadapt " + # add mode and adapter sequences + cmd+=" -{params.mode} {params.adapters} -m {params.min} -q {params.qual} {params.options} " + # paired end or single end + if [[ ${{#infiles[@]}} -eq 2 ]]; + then + cmd+=" -o ${{outfiles[0]}} -p ${{outfiles[1]}} ${{infiles[0]}} ${{infiles[1]}} " + else + cmd+=" -o ${{outfiles[0]}} ${{infiles[0]}}" + fi + #run command + eval "${{cmd}} > {log}" + + """ diff --git a/workflow/rules/bowtie2_index.rules b/workflow/rules/bowtie2_index.rules new file mode 100755 index 0000000000000000000000000000000000000000..ad02e620131d78019b9c13f5129ae85ba81c4e46 --- /dev/null +++ b/workflow/rules/bowtie2_index.rules @@ -0,0 +1,54 @@ +######################################################################### +# RNAsig: an automated pipeline to detect RNA signatures on viruses # +# # +# Authors: Rachel Legendre # +# Copyright (c) 2020-2021 Institut Pasteur (Paris). # +# # +# This file is part of RNAsig workflow. # +# # +# RNAsig is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation, either version 3 of the License, or # +# (at your option) any later version. # +# # +# RNAsig is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details . # +# # +# You should have received a copy of the GNU General Public License # +# along with RNAsig (LICENSE). # +# If not, see <https://www.gnu.org/licenses/>. # +######################################################################### + + +rule bowtie2_index: + """ + Genome indexation for Bowtie2 mapper + + Required input: + __bowtie2_index__fasta: the reference genome to indexed in FASTA format + + Required output: + __bowtie2_index__output_done: done file for bowtie2 mapping rule + + params: + __bowtie2_index__output_prefix: the directory where write the index + + """ + input: + fasta = __bowtie2_index__fasta + output: + __bowtie2_index__output_done + singularity: + "rnasig.img" + params: + prefix = __bowtie2_index__output_prefix + log: + __bowtie2_index__log + shell: + """ + bowtie2-build {input.fasta} {params.prefix} 2> {log} + samtools faidx {input.fasta} 2>> {log} + + """ diff --git a/workflow/rules/bowtie2_mapping.rules b/workflow/rules/bowtie2_mapping.rules new file mode 100755 index 0000000000000000000000000000000000000000..9e6d2702caa38369e52697e08f8d0ea32229c2c2 --- /dev/null +++ b/workflow/rules/bowtie2_mapping.rules @@ -0,0 +1,71 @@ +######################################################################### +# RNAsig: an automated pipeline to detect RNA signatures on viruses # +# # +# Authors: Rachel Legendre # +# Copyright (c) 2020-2021 Institut Pasteur (Paris). # +# # +# This file is part of RNAsig workflow. # +# # +# RNAsig is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation, either version 3 of the License, or # +# (at your option) any later version. # +# # +# RNAsig is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details . # +# # +# You should have received a copy of the GNU General Public License # +# along with RNAsig (LICENSE). # +# If not, see <https://www.gnu.org/licenses/>. # +######################################################################### + + +rule bowtie2_mapping: + input: + fastq = __bowtie2_mapping__input, + index = __bowtie2_mapping__index_done + output: + sort = __bowtie2_mapping__sort, + bam = temp(__bowtie2_mapping__bam) + singularity: + "rnasig.img" + log: + err = __bowtie2_mapping__logs_err, + out = __bowtie2_mapping__logs_out + params: + prefix_index = __bowtie2_mapping__prefix_index, + options = __bowtie2_mapping__options, + prefix = temp(__bowtie2_mapping__sortprefix) + threads: + config["bowtie2_mapping"]["threads"] + shell: + """ + set +o pipefail + tmp="{input.fastq}" + infiles=($tmp) + + cmd="bowtie2 -p {threads} {params.options} -x {params.prefix_index}" + # paired end or single end + if [[ ${{#infiles[@]}} -eq 2 ]] + then + bowtie_input="-1 ${{infiles[0]}} -2 ${{infiles[1]}}" + else + bowtie_input="-U ${{infiles[0]}} " + fi + + cmd+=" ${{bowtie_input}}" + # sam to bam + cmd+=" | samtools view -Sbh - > {output.bam}" + + # logs + cmd="(${{cmd}}) > {log.out} 2> {log.err}" + + # sort result + cmd+=" && samtools sort -o {output.bam} {params.prefix} > {output.sort} " + cmd+=" && samtools index {output.sort}" + + #run command + eval "${{cmd}}" + """ diff --git a/workflow/rules/fastqc.rules b/workflow/rules/fastqc.rules new file mode 100644 index 0000000000000000000000000000000000000000..df704bac507bea3266b943b02b45c74678fa990a --- /dev/null +++ b/workflow/rules/fastqc.rules @@ -0,0 +1,41 @@ +######################################################################### +# RNAsig: an automated pipeline to detect RNA signatures on viruses # +# # +# Authors: Rachel Legendre # +# Copyright (c) 2020-2021 Institut Pasteur (Paris). # +# # +# This file is part of RNAsig workflow. # +# # +# RNAsig is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation, either version 3 of the License, or # +# (at your option) any later version. # +# # +# RNAsig is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details . # +# # +# You should have received a copy of the GNU General Public License # +# along with RNAsig (LICENSE). # +# If not, see <https://www.gnu.org/licenses/>. # +######################################################################### + + +rule fastqc: + input: + fastq = __fastqc__input_fastq + output: + touch(__fastqc__output_done) + singularity: + "rnasig.img" + params: + wkdir = __fastqc__wkdir, + kargs = config['fastqc']['options'] + threads: config['fastqc']['threads'] if 'threads' in config['fastqc'] else 4 + log: + fastqc = __fastqc__log + shell: + "fastqc -t {threads} --outdir {params.wkdir} -f fastq {input.fastq} {params.kargs} > {log.fastqc}" + + diff --git a/workflow/rules/genomecov.rules b/workflow/rules/genomecov.rules new file mode 100644 index 0000000000000000000000000000000000000000..04f24ea622f22be11136ebedf048d491300ba81f --- /dev/null +++ b/workflow/rules/genomecov.rules @@ -0,0 +1,54 @@ +######################################################################### +# RNAsig: an automated pipeline to detect RNA signatures on viruses # +# # +# Authors: Rachel Legendre # +# Copyright (c) 2020-2021 Institut Pasteur (Paris). # +# # +# This file is part of RNAsig workflow. # +# # +# RNAsig is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation, either version 3 of the License, or # +# (at your option) any later version. # +# # +# RNAsig is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details . # +# # +# You should have received a copy of the GNU General Public License # +# along with RNAsig (LICENSE). # +# If not, see <https://www.gnu.org/licenses/>. # +######################################################################### + + + +rule genomecov: + """ + + """ + + input: + __genomecov__input + log: + __genomecov__logs + output: + __genomecov__output + singularity: + "rnasig.img" + params: + genome = __genomecov__genome, + options = __genomecov__options + shell: + """ + if [[ {input} == *"minus"* ]] ; then + strand="-" + else + strand="+" + fi + + bedtools genomecov {params.options} -ibam {input} -g {params.genome} -d -strand ${{strand}} > {output} 2> {log} + """ + + + diff --git a/workflow/rules/multiqc.rules b/workflow/rules/multiqc.rules new file mode 100644 index 0000000000000000000000000000000000000000..e5b2798420317a541168051a511c2be17d7f8894 --- /dev/null +++ b/workflow/rules/multiqc.rules @@ -0,0 +1,69 @@ +######################################################################### +# RNAsig: an automated pipeline to detect RNA signatures on viruses # +# # +# Authors: Rachel Legendre # +# Copyright (c) 2020-2021 Institut Pasteur (Paris). # +# # +# This file is part of RNAsig workflow. # +# # +# RNAsig is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation, either version 3 of the License, or # +# (at your option) any later version. # +# # +# RNAsig is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details . # +# # +# You should have received a copy of the GNU General Public License # +# along with RNAsig (LICENSE). # +# If not, see <https://www.gnu.org/licenses/>. # +######################################################################### + +rule multiqc: + """ + MultiQC aggregates results from bioinformatics analyses across many + samples into a single report. + + It searches a given directory for analysis logs and compiles a HTML + report. It's a general use tool, perfect for summarising the output from + numerous bioinformatics tools. + + :reference: http://multiqc.info/ + + Required input: + __multiqc__input_dir: an input directory where to find data and logs + + Required output: + __multiqc__output: multiqc_report.html in the input directory + + Config: + + .. code-block:: yaml + + multiqc: + options: "-c multiqc_config.yaml -f -x *.zip -e htseq" #any options recognised by multiqc + output-directory: " " #name of the output directory where to write results + + :note: if the directory exists, it is overwritten + """ + + input: + __multiqc__input + log: + __multiqc__logs + output: + __multiqc__output + singularity: + "rnasig.img" + params: + inputdir = __multiqc__input_dir, + options = __multiqc__options, + outdir = __multiqc__output_dir + shell: + """ + multiqc {params.inputdir} -o {params.outdir} {params.options} 2> {log} + """ + + diff --git a/workflow/rules/rnasig b/workflow/rules/rnasig new file mode 100644 index 0000000000000000000000000000000000000000..7771c7a140ec02cd42dae665d978d75640068683 --- /dev/null +++ b/workflow/rules/rnasig @@ -0,0 +1,170 @@ +######################################################################### +# RNASig # +# # +# Authors: Rachel Legendre # +# Copyright (c) 2019-2020 Institut Pasteur (Paris) and CNRS. # +# # +# This file is part of RNASig workflow. # +# # +# RNASig is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation, either version 3 of the License, or # +# (at your option) any later version. # +# # +# RNASig is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details . # +# # +# You should have received a copy of the GNU General Public License # +# along with RNASig (LICENSE). # +# If not, see <https://www.gnu.org/licenses/>. # +######################################################################### + + +rule rnasig: + """ + RNAsig rule ... + """ + input: + target = __rnasig__target + params: + outdir = __rnasig__output_dir, + directory = __rnasig__covdir, + gff = __rnasig__gfffile + + output: + report = __rnasig__report + + singularity: + "rnasig.img" + run: + R(""" + #load libraries + library(ggplot2) + library(tidyverse) + library(gggenes) + library(rtracklayer) + + ##### Variable to changed + WDIR = {params.outdir} + target_file = {input.target} + rawDir = {params.covdir} + gff_file <- {params.gff} + + ######################### + + setwd(WDIR) + #Read target file + target <- read.table(target_file, header=TRUE, na.strings="", skip=0) + + + #View(target) + #create labels and select files + labels <- within(target, id <- paste(target[,2],target[,3],target[,4],target[,5], sep="_")) + labels <- as.character(labels$id) + files <- as.character(target[,1]) + + #get counts for the first sample + rawCounts <- read.table(file.path(rawDir, files[1]), sep="\t", + quote="\"", header=FALSE, skip=0, stringsAsFactors=FALSE) + rawCounts$V1 <- NULL + colnames(rawCounts) <- c("Position", labels[1]) + head(rawCounts) + cat(files[1],": ",length(rawCounts[,labels[1]])," rows and ",sum(rawCounts[,labels[1]]==0)," null count(s)\n",sep="") + + #get count for all samples + for (i in 2:length(files)){ + tmp <- read.table(file.path(rawDir, files[i]), sep="\t", + quote="\"", header=FALSE, skip=0, stringsAsFactors=FALSE) + tmp$V1 <- NULL + colnames(tmp) <- c("Position", labels[i]) + rawCounts <- merge(rawCounts, tmp, by="Position", all=TRUE) + cat(files[i],": ",length(tmp[,labels[i]])," rows and ",sum(tmp[,labels[i]]==0)," null count(s)\n",sep="") + } + summary(rawCounts) + + + + + #Normalization of bead samples by corresponding total samples + + df <- rawCounts %>% + pivot_longer(cols = matches("_"), + names_to = c("Cond", "Receptor","Replicate", "strand"), + names_pattern = "(.+)_(.+)_(.+)_(.+)") %>% + group_by(Receptor, Replicate, strand) %>% + #Normalization of bead samples by corresponding total mean samples + dplyr::mutate(value_Total_mean = mean(ifelse(Cond == "Total", value, NA), na.rm = TRUE), + norm_Total = value/value_Total_mean) %>% + filter(Cond != "Total") %>% + group_by(Cond, strand, Position) %>% + #Normalization of each "Receptor" Sample by Cherry mean + dplyr::mutate(value_Cherry_mean = mean(ifelse(Receptor == "Cherry", norm_Total, NA), na.rm = TRUE), + norm_Cherry = norm_Total/value_Cherry_mean) %>% + group_by(Position, Cond, Receptor, strand) %>% + dplyr::mutate(norm_mean = mean(norm_Cherry, na.rm = TRUE), + norm_min = min(norm_Cherry, na.rm = TRUE), + norm_max = max(norm_Cherry, na.rm = TRUE)) %>% + ungroup() %>% + select(-Replicate, -value_Total_mean, -value, -value_Cherry_mean, -norm_Cherry) %>% + distinct() + + ##### plot coverage + colors <- c("#f38400", "#008856", "#be0032", "#a1caf1", "#f3c300", + "#875692", "#c2b280", "#848482", "#e68fac", "#0067a5", + "#f99379", "#604e97", "#f6a600", "#b3446c", "#dcd300", + "#882d17", "#8db600", "#654522", "#e25822", "#2b3d26") + + + g1 <- df %>% + mutate(strand = fct_relevel(strand, "plus", "minus")) %>% + ggplot(aes(x = Position, y = norm_mean)) + + geom_smooth(method='loess', span=0.02, se=FALSE, na.rm=TRUE, alpha=0.4, size=0.5, aes(color=Receptor)) + + geom_ribbon(alpha = 0.4, aes(fill=Receptor, ymin=norm_min, ymax = norm_max)) + + scale_color_manual(values=colors) + + scale_fill_manual( values=colors ) + + ylab("RLR binding (FC)") + + xlab(" ") + ylim(0,4) + + # ggtitle("RLR binding along viral genome") + + theme_bw()+ + facet_wrap(~strand, ncol = 1) + + g1 + + ggsave("Coverage_only.png", width = 10, height = 5) + + + ###ADD annotation + my_columns <- c("seqid", "start", "end", "strand", "type") + my_filter <- list(type="gene") + my_tags=c("seqid", "start", "end", "strand", "type") + genes <- readGFF(gff_file, version=0, + columns=my_columns, tags=NULL, filter=my_filter, nrows=-1) + + + + + g2 <- ggplot(genes, aes(xmin = start, xmax = end, y = strand, fill = Name, label=Name)) + + geom_gene_arrow() + + geom_gene_label(align = "left") + + xlab("Position in basepair") + + ylab("Annotation ") + + scale_fill_brewer(palette = "Set3") + + theme_genes() + + theme(legend.position = "none") + + + g2 + + plot1 <- ggplot2::ggplotGrob(g1) + plot2 <- ggplot2::ggplotGrob(g2) + maxWidth <- grid::unit.pmax(plot1$widths, plot2$widths) + plot1$widths <- as.list(maxWidth) + plot2$widths <- as.list(maxWidth) + + + pdf({output.report}) + plot <- gridExtra::grid.arrange(plot1, plot2, ncol=1, heights = c(12, 5)) + dev.off() + """) diff --git a/workflow/rules/rnasig.rules b/workflow/rules/rnasig.rules new file mode 100644 index 0000000000000000000000000000000000000000..f976087663bf5006783049d4692ec5a42b32b2b8 --- /dev/null +++ b/workflow/rules/rnasig.rules @@ -0,0 +1,169 @@ +######################################################################### +# RNAsig: an automated pipeline to detect RNA signatures on viruses # +# # +# Authors: Rachel Legendre # +# Copyright (c) 2020-2021 Institut Pasteur (Paris). # +# # +# This file is part of RNAsig workflow. # +# # +# RNAsig is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation, either version 3 of the License, or # +# (at your option) any later version. # +# # +# RNAsig is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details . # +# # +# You should have received a copy of the GNU General Public License # +# along with RNAsig (LICENSE). # +# If not, see <https://www.gnu.org/licenses/>. # +######################################################################### + +rule rnasig: + """ + RNAsig rule ... + """ + input: + target = __rnasig__target + params: + outdir = __rnasig__output_dir, + directory = __rnasig__covdir, + gff = __rnasig__gfffile + + output: + report = __rnasig__report + + singularity: + "rnasig.img" + run: + R(""" + #load libraries + library(ggplot2) + library(tidyverse) + library(gggenes) + library(rtracklayer) + + ##### Variable to changed + WDIR = {params.outdir} + target_file = {input.target} + rawDir = {params.covdir} + gff_file <- {params.gff} + + ######################### + + setwd(WDIR) + #Read target file + target <- read.table(target_file, header=TRUE, na.strings="", skip=0) + + + #View(target) + #create labels and select files + labels <- within(target, id <- paste(target[,2],target[,3],target[,4],target[,5], sep="_")) + labels <- as.character(labels$id) + files <- as.character(target[,1]) + + #get counts for the first sample + rawCounts <- read.table(file.path(rawDir, files[1]), sep="\t", + quote="\"", header=FALSE, skip=0, stringsAsFactors=FALSE) + rawCounts$V1 <- NULL + colnames(rawCounts) <- c("Position", labels[1]) + head(rawCounts) + cat(files[1],": ",length(rawCounts[,labels[1]])," rows and ",sum(rawCounts[,labels[1]]==0)," null count(s)\n",sep="") + + #get count for all samples + for (i in 2:length(files)){ + tmp <- read.table(file.path(rawDir, files[i]), sep="\t", + quote="\"", header=FALSE, skip=0, stringsAsFactors=FALSE) + tmp$V1 <- NULL + colnames(tmp) <- c("Position", labels[i]) + rawCounts <- merge(rawCounts, tmp, by="Position", all=TRUE) + cat(files[i],": ",length(tmp[,labels[i]])," rows and ",sum(tmp[,labels[i]]==0)," null count(s)\n",sep="") + } + summary(rawCounts) + + + + + #Normalization of bead samples by corresponding total samples + + df <- rawCounts %>% + pivot_longer(cols = matches("_"), + names_to = c("Cond", "Receptor","Replicate", "strand"), + names_pattern = "(.+)_(.+)_(.+)_(.+)") %>% + group_by(Receptor, Replicate, strand) %>% + #Normalization of bead samples by corresponding total mean samples + dplyr::mutate(value_Total_mean = mean(ifelse(Cond == "Total", value, NA), na.rm = TRUE), + norm_Total = value/value_Total_mean) %>% + filter(Cond != "Total") %>% + group_by(Cond, strand, Position) %>% + #Normalization of each "Receptor" Sample by Cherry mean + dplyr::mutate(value_Cherry_mean = mean(ifelse(Receptor == "Cherry", norm_Total, NA), na.rm = TRUE), + norm_Cherry = norm_Total/value_Cherry_mean) %>% + group_by(Position, Cond, Receptor, strand) %>% + dplyr::mutate(norm_mean = mean(norm_Cherry, na.rm = TRUE), + norm_min = min(norm_Cherry, na.rm = TRUE), + norm_max = max(norm_Cherry, na.rm = TRUE)) %>% + ungroup() %>% + select(-Replicate, -value_Total_mean, -value, -value_Cherry_mean, -norm_Cherry) %>% + distinct() + + ##### plot coverage + colors <- c("#f38400", "#008856", "#be0032", "#a1caf1", "#f3c300", + "#875692", "#c2b280", "#848482", "#e68fac", "#0067a5", + "#f99379", "#604e97", "#f6a600", "#b3446c", "#dcd300", + "#882d17", "#8db600", "#654522", "#e25822", "#2b3d26") + + + g1 <- df %>% + mutate(strand = fct_relevel(strand, "plus", "minus")) %>% + ggplot(aes(x = Position, y = norm_mean)) + + geom_smooth(method='loess', span=0.02, se=FALSE, na.rm=TRUE, alpha=0.4, size=0.5, aes(color=Receptor)) + + geom_ribbon(alpha = 0.4, aes(fill=Receptor, ymin=norm_min, ymax = norm_max)) + + scale_color_manual(values=colors) + + scale_fill_manual( values=colors ) + + ylab("RLR binding (FC)") + + xlab(" ") + ylim(0,4) + + # ggtitle("RLR binding along viral genome") + + theme_bw()+ + facet_wrap(~strand, ncol = 1) + + g1 + + ggsave("Coverage_only.png", width = 10, height = 5) + + + ###ADD annotation + my_columns <- c("seqid", "start", "end", "strand", "type") + my_filter <- list(type="gene") + my_tags=c("seqid", "start", "end", "strand", "type") + genes <- readGFF(gff_file, version=0, + columns=my_columns, tags=NULL, filter=my_filter, nrows=-1) + + + + + g2 <- ggplot(genes, aes(xmin = start, xmax = end, y = strand, fill = Name, label=Name)) + + geom_gene_arrow() + + geom_gene_label(align = "left") + + xlab("Position in basepair") + + ylab("Annotation ") + + scale_fill_brewer(palette = "Set3") + + theme_genes() + + theme(legend.position = "none") + + + g2 + + plot1 <- ggplot2::ggplotGrob(g1) + plot2 <- ggplot2::ggplotGrob(g2) + maxWidth <- grid::unit.pmax(plot1$widths, plot2$widths) + plot1$widths <- as.list(maxWidth) + plot2$widths <- as.list(maxWidth) + + + pdf({output.report}) + plot <- gridExtra::grid.arrange(plot1, plot2, ncol=1, heights = c(12, 5)) + dev.off() + """) diff --git a/workflow/scripts/.gitkeep b/workflow/scripts/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/workflow/scripts/RNAsig.R b/workflow/scripts/RNAsig.R new file mode 100644 index 0000000000000000000000000000000000000000..cd9bf904bfd9c3a6321d2e2066d9ab794e223d95 --- /dev/null +++ b/workflow/scripts/RNAsig.R @@ -0,0 +1,125 @@ +#load libraries +library(ggplot2) +library(tidyverse) +library(gggenes) +library(rtracklayer) + +##### Variable to change +WDIR = "~/Documents/projets/15066_Komarova" +target_file = "design_strand.txt" +rawDir = "Coverage" +gff_file <- "~/Documents/projets/15066_Komarova/measle.gff3" + +######################### + +setwd(WDIR) +#Read target file +target <- read.table(target_file, header=TRUE, na.strings="", skip=0) # ,sep="\t", fill=TRUE ) + + +#View(target) +#create labels and select files +labels <- within(target, id <- paste(target[,2],target[,3],target[,4],target[,5], sep="_")) +labels <- as.character(labels$id) +files <- as.character(target[,1]) + +#get counts for the first sample +rawCounts <- read.table(file.path(rawDir, files[1]), sep="\t", + quote="\"", header=FALSE, skip=0, stringsAsFactors=FALSE) +rawCounts$V1 <- NULL +colnames(rawCounts) <- c("Position", labels[1]) +head(rawCounts) +cat(files[1],": ",length(rawCounts[,labels[1]])," rows and ",sum(rawCounts[,labels[1]]==0)," null count(s)\n",sep="") + +#get count for all samples +for (i in 2:length(files)){ + tmp <- read.table(file.path(rawDir, files[i]), sep="\t", + quote="\"", header=FALSE, skip=0, stringsAsFactors=FALSE) + tmp$V1 <- NULL + colnames(tmp) <- c("Position", labels[i]) + rawCounts <- merge(rawCounts, tmp, by="Position", all=TRUE) + cat(files[i],": ",length(tmp[,labels[i]])," rows and ",sum(tmp[,labels[i]]==0)," null count(s)\n",sep="") +} +summary(rawCounts) + + + + +#Normalization of bead samples by corresponding total samples + +df <- rawCounts %>% + pivot_longer(cols = matches("_"), + names_to = c("Cond", "Receptor","Replicate", "strand"), + names_pattern = "(.+)_(.+)_(.+)_(.+)") %>% + group_by(Receptor, Replicate, strand) %>% + #Normalization of bead samples by corresponding total mean samples + dplyr::mutate(value_Total_mean = mean(ifelse(Cond == "Total", value, NA), na.rm = TRUE), + norm_Total = value/value_Total_mean) %>% + filter(Cond != "Total") %>% + group_by(Cond, strand, Position) %>% + #Normalization of each "Receptor" Sample by Cherry mean + dplyr::mutate(value_Cherry_mean = mean(ifelse(Receptor == "Cherry", norm_Total, NA), na.rm = TRUE), + norm_Cherry = norm_Total/value_Cherry_mean) %>% + group_by(Position, Cond, Receptor, strand) %>% + dplyr::mutate(norm_mean = mean(norm_Cherry, na.rm = TRUE), + norm_min = min(norm_Cherry, na.rm = TRUE), + norm_max = max(norm_Cherry, na.rm = TRUE)) %>% + ungroup() %>% + select(-Replicate, -value_Total_mean, -value, -value_Cherry_mean, -norm_Cherry) %>% + distinct() + +##### plot coverage +colors <- c("#f38400", "#008856", "#be0032", "#a1caf1", "#f3c300", + "#875692", "#c2b280", "#848482", "#e68fac", "#0067a5", + "#f99379", "#604e97", "#f6a600", "#b3446c", "#dcd300", + "#882d17", "#8db600", "#654522", "#e25822", "#2b3d26") + + +g1 <- ggplot(data = df, aes(x = Position, y = norm_mean)) + + geom_smooth(method='loess', span=0.02, se=FALSE, na.rm=TRUE, alpha=0.4, size=0.5, aes(color=Receptor)) + + geom_ribbon(alpha = 0.4, aes(fill=Receptor, ymin=norm_min, ymax = norm_max)) + + scale_color_manual(values=colors) + + scale_fill_manual( values=colors ) + + ylab("RLR binding (FC)") + + xlab(" ") + ylim(0,11) + + # ggtitle("RLR binding along viral genome") + + theme_bw()+ + facet_wrap(~strand, ncol = 1) + +g1 + +ggsave("Coverage_only.png", width = 10, height = 5) + + +###ADD annotation +my_columns <- c("seqid", "start", "end", "strand", "type") +my_filter <- list(type="gene") +my_tags=c("seqid", "start", "end", "strand", "type") +genes <- readGFF(gff_file, version=0, + columns=my_columns, tags=NULL, filter=my_filter, nrows=-1) + + + + +g2 <- ggplot(genes, aes(xmin = start, xmax = end, y = strand, fill = Name, label=Name)) + + geom_gene_arrow() + + geom_gene_label(align = "left") + + xlab("Position in basepair") + + ylab("Annotation ") + + scale_fill_brewer(palette = "Set3") + + theme_genes() + + theme(legend.position = "none") + + +g2 + +plot1 <- ggplot2::ggplotGrob(g1) +plot2 <- ggplot2::ggplotGrob(g2) +maxWidth <- grid::unit.pmax(plot1$widths, plot2$widths) +plot1$widths <- as.list(maxWidth) +plot2$widths <- as.list(maxWidth) + + +pdf("Coverage_ribbon.pdf") +plot <- gridExtra::grid.arrange(plot1, plot2, ncol=1, heights = c(12, 5)) +dev.off()