diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile index e17f7b890b713d7ce01e5ebbe506bb868cd045fa..6bf16cfc18f7a1da15e1ac1a22ed192d373fb0ce 100644 --- a/small_RNA-seq/small_RNA-seq.snakefile +++ b/small_RNA-seq/small_RNA-seq.snakefile @@ -74,7 +74,7 @@ from subprocess import Popen, PIPE, CalledProcessError from itertools import chain, combinations, product, repeat, starmap from functools import reduce from operator import or_ as union -from cytoolz import concat, merge_with, take_nth +from cytoolz import concat, merge_with, take_nth, valmap def concat_lists(lists): return list(concat(lists)) @@ -98,7 +98,6 @@ warnings.formatwarning = formatwarning # from gatb import Bank from mappy import fastx_read # To parse SAM format -import pysam import pyBigWig # To compute correlation coefficient @@ -142,7 +141,7 @@ from libhts import plot_paired_scatters, plot_norm_correlations, plot_counts_dis from libworkflows import texscape, ensure_relative, cleanup_and_backup from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter from libworkflows import read_int_from_file, strip_split, file_len, last_lines, save_plot, SHELL_FUNCTIONS -from libworkflows import filter_combinator, sum_feature_counts, sum_htseq_counts, warn_context +from libworkflows import filter_combinator, read_feature_counts, sum_feature_counts, sum_htseq_counts, warn_context from smincludes import rules as irules strip = str.strip @@ -264,16 +263,19 @@ size_selected = "%s-%s" % (MIN_LEN, MAX_LEN) # 24 45 # 25 11 # 26 3 +# This should ideally come from genome configuration: MI_MAX = 26 read_type_max_len = { size_selected: int(MAX_LEN), "pi": min(PI_MAX, int(MAX_LEN)), "si": min(SI_MAX, int(MAX_LEN)), "mi": min(MI_MAX, int(MAX_LEN))} -#READ_TYPES_FOR_COMPOSITION=[size_selected, "nomap_siRNA", "prot_siRNA", "te_siRNA", "pseu_siRNA", "piRNA", "miRNA", "prot_siuRNA", "te_siuRNA", "pseu_siuRNA", "all_siRNA"] -READ_TYPES_FOR_COMPOSITION=[size_selected, 
"nomap_siRNA", "all_siRNA"] + list(map(lambda x: "%s%s" % (x, "RNA"), SMALL_TYPES)) -READ_TYPES_FOR_MAPPING=[size_selected, "nomap_siRNA", "all_siRNA", "siRNA", "siuRNA"] + list(map(lambda x: "%s%s" % (x, "RNA"), SMALL_TYPES)) -#READ_TYPES_FOR_MAPPING=[size_selected, "all_siRNA"] +READ_TYPES_FOR_COMPOSITION = [ + size_selected, "nomap_siRNA", "all_siRNA"] + list(map(lambda x: "%s%s" % (x, "RNA"), SMALL_TYPES)) +READ_TYPES_FOR_MAPPING = [ + size_selected, "nomap_siRNA", "prot_sisiuRNA", + "all_siRNA", "all_siuRNA", "all_sisiuRNA", + "siRNA", "siuRNA", "sisiuRNA"] + list(map(lambda x: "%s%s" % (x, "RNA"), SMALL_TYPES)) # Types of annotation features, as defined in the "gene_biotype" # GTF attribute sections of the annotation files. COUNT_BIOTYPES = config["count_biotypes"] @@ -306,8 +308,6 @@ BOXPLOT_GENE_LISTS = [ "piRNA_dependent_prot_si_down4", "csr1_prot_si_supertargets_48hph", "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014"] -# temporary -#BIOTYPES = ["pseudogene"] aligner = config["aligner"] ######################## # Genome configuration # @@ -315,18 +315,19 @@ aligner = config["aligner"] genome_dict = config["genome_dict"] genome = genome_dict["name"] chrom_sizes = get_chrom_sizes(genome_dict["size"]) +chrom_sizes.update(valmap(int, genome_dict.get("extra_chromosomes", {}))) genomelen = sum(chrom_sizes.values()) genome_db = genome_dict["db"][aligner] # bed file binning the genome in 10nt bins genome_binned = genome_dict["binned"] annot_dir = genome_dict["annot_dir"] +exon_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"), # TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"] convert_dir = genome_dict["convert_dir"] gene_lists_dir = genome_dict["gene_lists_dir"] -exon_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"), avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) index = genome_db -#convert_dir = config["convert_dir"] + output_dir = config["output_dir"] local_annot_dir = 
config.get("local_annot_dir", OPJ(output_dir, "annotations")) log_dir = config.get("log_dir", OPJ(output_dir, "logs")) @@ -334,6 +335,14 @@ data_dir = config.get("data_dir", OPJ(output_dir, "data")) # To put the results of small_RNA_seq_annotate reads_dir = OPJ(output_dir, aligner, f"mapped_{genome}", "reads") counts_dir = OPJ(output_dir, aligner, f"mapped_{genome}", "annotation") +READ_TYPES_FOR_COUNTING = [size_selected, "all_sisiuRNA", "sisiuRNA"] + list(map(lambda x: "%s%s" % (x, "RNA"), ["prot_sisiu"])) +# Actually, we don't need to enforce this: snakemake will map whatever read type is needed for counting +assert set(READ_TYPES_FOR_COUNTING) <= set(READ_TYPES_FOR_MAPPING), "Cannot count reads that were not mapped." +# Reads remapped and counted using featureCounts +REMAPPED_COUNTED = [ + f"{small_type}_on_{genome}_{biotype}_{orientation}_transcript" for ( + small_type, biotype, orientation) in product( + READ_TYPES_FOR_COUNTING, COUNT_BIOTYPES, ORIENTATIONS)] # Used to skip some genotype x treatment x replicate number combinations # when some of them were not sequenced forbidden = {frozenset(wc_comb.items()) for wc_comb in config["missing"]} @@ -444,17 +453,18 @@ wildcard_constraints: min_dist="\d+", min_len="\d+", #max_len="\d+", - biotype="|".join(COUNT_BIOTYPES + ANNOT_BIOTYPES), + biotype="|".join(set(COUNT_BIOTYPES + ANNOT_BIOTYPES)), id_list="|".join(GENE_LISTS), type_set="|".join(["all", "protein_coding", "protein_coding_TE"]), - small_type="[spm]i|pisimi|siu|prot_si|te_si|pseu_si|satel_si|simrep_si|prot_siu|te_siu|pseu_siu|satel_siu|simrep_siu|sisiu|all_si|all_siu|all_sisiu", - read_type="|".join([ - "raw", "trimmed", "deduped", f"{size_selected}", - "nomap", "nomap_siRNA", "mapped", - "siRNA", "piRNA", "miRNA", "siuRNA", - "prot_siRNA", "te_siRNA", "pseu_siRNA", "satel_siRNA", "simrep_siRNA", - "prot_siuRNA", "te_siuRNA", "pseu_siuRNA", "satel_siuRNA", "simrep_siuRNA", - "all_siRNA", "all_siuRNA"]), + 
small_type="[spm]i|pisimi|siu|prot_si|te_si|pseu_si|satel_si|simrep_si|prot_siu|te_siu|pseu_siu|satel_siu|simrep_siu|sisiu|all_si|all_siu|all_sisiu|" + "|".join(REMAPPED_COUNTED), + # read_type="|".join([ + # "raw", "trimmed", "deduped", f"{size_selected}", + # "nomap", "nomap_siRNA", "mapped", + # "siRNA", "piRNA", "miRNA", "siuRNA", + # "prot_siRNA", "te_siRNA", "pseu_siRNA", "satel_siRNA", "simrep_siRNA", + # "prot_siuRNA", "te_siuRNA", "pseu_siuRNA", "satel_siuRNA", "simrep_siuRNA", + # "prot_sisiuRNA", "te_sisiuRNA", "pseu_sisiuRNA", "satel_sisiuRNA", "simrep_sisiuRNA", + # "all_siRNA", "all_siuRNA", "all_sisiuRNA"] + REMAPPED_COUNTED), standard="zscore|robust|minmax|unit|identity", orientation="all|fwd|rev", contrast="|".join(CONTRASTS), @@ -569,23 +579,26 @@ read_graphs = [ fig_format=FIG_FORMATS), filtered_product, lib=LIBS, rep=REPS), ] +ip_fold_heatmaps = [] +de_fold_heatmaps = [] +ip_fold_boxplots = [] if contrasts_dict["ip"]: - fold_heatmaps = expand( + ip_fold_heatmaps = expand( OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.{fig_format}"), - small_type=["pisimi", "prot_si"], fold_type=["mean_log2_RPM_fold", "log2FoldChange"], fig_format=FIG_FORMATS) + small_type=["pisimi", "prot_si"], fold_type=["mean_log2_RPM_fold"], fig_format=FIG_FORMATS) ip_fold_boxplots = expand( OPJ(output_dir, "figures", "all_{contrast_type}", "{contrast_type}_{small_type}_{fold_type}_{gene_list}_boxplots.{fig_format}"), contrast_type=["ip"], small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"], gene_list=BOXPLOT_GENE_LISTS, fig_format=FIG_FORMATS) -else: - fold_heatmaps = expand( +if contrasts_dict["de"]: + de_fold_heatmaps = expand( OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.{fig_format}"), small_type=["pisimi", "prot_si"], fold_type=["log2FoldChange"], fig_format=FIG_FORMATS) - ip_fold_boxplots = [] exploratory_graphs = [ - fold_heatmaps, + ip_fold_heatmaps, + de_fold_heatmaps, # Large figures, not very 
readable # expand( # OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "{contrast}", "{contrast}_{small_type}_by_{norm}_pairplots.{fig_format}"), @@ -629,6 +642,29 @@ rule all: filtered_product, lib=LIBS, rep=REPS, read_type=[size_selected]), bigwig_files, meta_profiles, + # snakemake -n OK + # expand( + # expand( + # OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", + # "{{lib}}_{{rep}}_{read_type}_on_%s_{orientation}_transcript_counts.txt" % (genome)), + # read_type=READ_TYPES_FOR_MAPPING, orientation=ORIENTATIONS), + # filtered_product, lib=LIBS, rep=REPS), + # TODO + # snakemake -n OK + # expand( + # expand( + # OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", + # "{{lib}}_{{rep}}_{small_type}_counts.txt"), + # small_type=REMAPPED_COUNTED), + # filtered_product, lib=LIBS, rep=REPS), + # snakemake -n not OK: summaries contains all biotypes + # expand( + # expand( + # OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", + # "{{lib}}_{{rep}}_{small_type}_counts.txt"), + # small_type=REMAPPED_COUNTED), + # filtered_product, lib=LIBS, rep=REPS), + # REMAPPED_COUNTED = [f"{small_type}_on_{genome}_{biotype}_{orientation}_transcript" for (small_type, biotype, orientation) in product(READ_TYPES_FOR_MAPPING, set(COUNT_BIOTYPES + ANNOT_BIOTYPES), ORIENTATIONS)] expand( expand( OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", @@ -649,11 +685,27 @@ rule all: exploratory_graphs, fold_boxplots, #expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.{fig_format}"), small_type=SMALL_TYPES, fig_format=FIG_FORMATS), - # piRNA and satel_siu raise ValueError: `dataset` input should have multiple elements when plotting - # simrep_siu raise TypeError: Empty 'DataFrame': no numeric data to plot + # snakemake -n OK + expand(OPJ( + output_dir, aligner, f"mapped_{genome}", "feature_count", + "all_{small_type}_counts.txt"), small_type=REMAPPED_COUNTED), + #expand(OPJ( + # 
output_dir, aligner, f"mapped_{genome}", "feature_count", + # "all_{read_type}_on_%s_{biotype}_{orientation}_transcript_counts.txt" % genome), read_type=READ_TYPES_FOR_COUNTING, biotype=ANNOT_BIOTYPES, orientation=ORIENTATIONS), + # expand(OPJ( + # output_dir, aligner, f"mapped_{genome}", "feature_count", + # "all_{small_type}_on_%s_{biotype}_{orientation}_transcript_counts.txt" % genome), + # small_type=READ_TYPES_FOR_MAPPING, biotype=set(COUNT_BIOTYPES + ANNOT_BIOTYPES), orientation=ORIENTATIONS), + # snakemake -n not OK (Missing input files for rule gather_small_RNA_counts) + #expand( + # OPJ(counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"), + # small_type=REMAPPED_COUNTED), + ## expand( OPJ(counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"), small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu", "pisimi"]), + # piRNA and satel_siu raise ValueError: `dataset` input should have multiple elements when plotting + # simrep_siu raise TypeError: Empty 'DataFrame': no numeric data to plot expand( OPJ(output_dir, "figures", "{small_type}_norm_correlations.{fig_format}"), small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu", "pisimi"], fig_format=FIG_FORMATS), @@ -769,8 +821,24 @@ def source_fastq(wildcards): return rules.small_RNA_seq_annotate.output.satel_siu elif read_type == "simrep_siuRNA": return rules.small_RNA_seq_annotate.output.simrep_siu + elif read_type == "all_siuRNA": + return rules.small_RNA_seq_annotate.output.all_siu elif read_type == "siuRNA": return rules.join_si_reads.output.siu + elif read_type == "sisiuRNA": + return rules.join_si_reads.output.sisiu + elif read_type == "prot_sisiuRNA": + return rules.join_si_reads.output.prot_sisiu + elif read_type == "te_sisiuRNA": + return rules.join_si_reads.output.te_sisiu + elif read_type == "pseu_sisiuRNA": + return rules.join_si_reads.output.pseu_sisiu + elif 
read_type == "satel_sisiuRNA": + return rules.join_si_reads.output.satel_sisiu + elif read_type == "simrep_sisiuRNA": + return rules.join_si_reads.output.simrep_sisiu + elif read_type == "all_sisiuRNA": + return rules.join_si_reads.output.all_sisiu else: raise NotImplementedError("Unknown read type: %s" % read_type) @@ -834,9 +902,11 @@ rule remap_on_genome: wildcard_constraints: read_type = "|".join([ "nomap", "nomap_siRNA", "mapped", - "siRNA", "piRNA", "miRNA", "siuRNA", + "siRNA", "piRNA", "miRNA", "siuRNA", "sisiuRNA", "prot_siRNA", "te_siRNA", "pseu_siRNA", "satel_siRNA", "simrep_siRNA", - "prot_siuRNA", "te_siuRNA", "pseu_siuRNA", "satel_siuRNA", "simrep_siuRNA", "all_siRNA"]) + "prot_siuRNA", "te_siuRNA", "pseu_siuRNA", "satel_siuRNA", "simrep_siuRNA", + "prot_sisiuRNA", "te_sisiuRNA", "pseu_sisiuRNA", "satel_sisiuRNA", "simrep_sisiuRNA", + "all_siRNA", "all_siuRNA", "all_sisiuRNA"]) params: aligner = aligner, index = genome_db, @@ -914,7 +984,7 @@ def feature_orientation2stranded(wildcards): def source_bams(wildcards): """We can count from several bam files with feature_count.""" - read_types = wildcards.read_types.split("_and_") + read_types = wildcards.read_type.split("_and_") sorted_bams = [OPJ( output_dir, aligner, f"mapped_{genome}", f"{wildcards.lib}_{wildcards.rep}", f"{read_type}_on_{genome}_sorted.bam") for read_type in read_types] @@ -927,20 +997,20 @@ rule feature_count_reads: output: counts = OPJ( output_dir, aligner, f"mapped_{genome}", "feature_count", - "{lib}_{rep}_{read_types}_on_%s_{biotype}_{orientation}_{feature_type}_counts.txt" % genome), + "{lib}_{rep}_{read_type}_on_%s_{biotype}_{orientation}_{feature_type}_counts.txt" % genome), params: stranded = feature_orientation2stranded, annot = lambda wildcards: OPJ(annot_dir, f"{wildcards.biotype}.gtf") message: - "Counting {wildcards.orientation} {wildcards.biotype} {wildcards.feature_type} reads for {wildcards.lib}_{wildcards.rep}_{wildcards.read_types} with featureCounts." 
+ "Counting {wildcards.orientation} {wildcards.biotype} {wildcards.feature_type} reads for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type} with featureCounts." benchmark: - OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_types}_{biotype}_{orientation}_{feature_type}_benchmark.txt"), + OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_type}_{biotype}_{orientation}_{feature_type}_benchmark.txt"), log: - log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_types}_{biotype}_{orientation}_{feature_type}.log"), - err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_types}_{biotype}_{orientation}_{feature_type}.err"), + log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_type}_{biotype}_{orientation}_{feature_type}.log"), + err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_type}_{biotype}_{orientation}_{feature_type}.err"), shell: """ - tmpdir=$(TMPDIR=/var/tmp mktemp --tmpdir -d "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.read_types}_{wildcards.biotype}_{wildcards.orientation}_{wildcards.feature_type}.XXXXXXXXXX") + tmpdir=$(TMPDIR=/var/tmp mktemp --tmpdir -d "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.read_type}_{wildcards.biotype}_{wildcards.orientation}_{wildcards.feature_type}.XXXXXXXXXX") cmd="featureCounts -a {params.annot} -o {output.counts} -t {wildcards.feature_type} -g "gene_id" -O -s {params.stranded} --fracOverlap 1 --tmpDir ${{tmpdir}} {input}" featureCounts -v 2> {log.log} echo ${{cmd}} 1>> {log.log} @@ -954,19 +1024,60 @@ rule summarize_feature_counts: input: expand( OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", - "{{lib}}_{{rep}}_{{read_types}}_on_%s_{biotype}_{{orientation}}_{{feature_type}}_counts.txt" % genome), + "{{lib}}_{{rep}}_{{read_type}}_on_%s_{biotype}_{{orientation}}_{{feature_type}}_counts.txt" % genome), biotype=COUNT_BIOTYPES), output: summary = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", - 
"{lib}_{rep}_{read_types}_on_%s_{orientation}_{feature_type}_counts.txt" % genome) + "{lib}_{rep}_{read_type}_on_%s_{orientation}_{feature_type}_counts.txt" % genome) run: with open(output.summary, "w") as summary_file: header = "\t".join(COUNT_BIOTYPES) summary_file.write("#biotypes\t%s\n" % header) - sums = "\t".join((str(sum_feature_counts(counts_file, nb_bams=len(wildcards.read_types.split("_and_")))) for counts_file in input)) + sums = "\t".join((str(sum_feature_counts(counts_file, nb_bams=len(wildcards.read_type.split("_and_")))) for counts_file in input)) summary_file.write(f"%s_%s_%s\t%s\n" % (wildcards.lib, wildcards.rep, wildcards.orientation, sums)) +rule gather_feature_counts: + """For a given biotype, gather counts from all libraries in one table.""" + input: + counts_tables = expand( + OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", + "{lib}_{rep}_{{read_type}}_on_%s_{{biotype}}_{{orientation}}_{{feature_type}}_counts.txt" % genome), + filtered_product, lib=LIBS, rep=REPS), + output: + counts_table = OPJ( + output_dir, aligner, f"mapped_{genome}", "feature_count", + "all_{read_type}_on_%s_{biotype}_{orientation}_{feature_type}_counts.txt" % genome), + wildcard_constraints: + biotype = "|".join(COUNT_BIOTYPES) + run: + # Gathering the counts data + ############################ + counts_files = (OPJ( + output_dir, + aligner, + f"mapped_{genome}", + "feature_count", + f"{cond_name}_{wildcards.read_type}_on_{genome}_{wildcards.biotype}_{wildcards.orientation}_{wildcards.feature_type}_counts.txt") for cond_name in COND_NAMES) + counts_data = pd.concat( + (read_feature_counts( + counts_file, + nb_bams=len(wildcards.read_type.split("_and_"))) for counts_file in counts_files), + axis=1).fillna(0).astype(int) + counts_data.columns = COND_NAMES + # Simple_repeat|Simple_repeat|(TTTTTTG)n:1 + # Simple_repeat|Simple_repeat|(TTTTTTG)n:2 + # Simple_repeat|Simple_repeat|(TTTTTTG)n:3 + # Simple_repeat|Simple_repeat|(TTTTTTG)n:4 + # -> 
Simple_repeat|Simple_repeat|(TTTTTTG)n + if wildcards.biotype.endswith("_rmsk_families"): + repeat_families = [":".join(name.split(":")[:-1]) for name in counts_data.index] + # Sum the counts for a given repeat family + counts_data = counts_data.assign(family=repeat_families).groupby("family").sum() + counts_data.index.names = ["gene"] + counts_data.to_csv(output.counts_table, sep="\t") + + rule gather_annotations: input: expand(OPJ(annot_dir, "{biotype}.gtf"), biotype=ANNOT_BIOTYPES), @@ -1050,6 +1161,7 @@ rule small_RNA_seq_annotate: pseu_siu = OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), "pseu_siuRNA.fastq.gz"), satel_siu = OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), "satel_siuRNA.fastq.gz"), simrep_siu = OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), "simrep_siuRNA.fastq.gz"), + all_siu = OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), "all_siuRNA.fastq.gz"), # stats ######## all_annots = OPJ(counts_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), "annot_stats.txt"), @@ -1142,24 +1254,38 @@ rule plot_read_numbers: rule join_si_reads: input: prot_si = rules.small_RNA_seq_annotate.output.prot_si, - te_si = rules.small_RNA_seq_annotate.output.prot_si, - pseu_si = rules.small_RNA_seq_annotate.output.pseu_si, - satel_si = rules.small_RNA_seq_annotate.output.satel_si, - simrep_si = rules.small_RNA_seq_annotate.output.simrep_si, prot_siu = rules.small_RNA_seq_annotate.output.prot_siu, + te_si = rules.small_RNA_seq_annotate.output.te_si, te_siu = rules.small_RNA_seq_annotate.output.te_siu, + pseu_si = rules.small_RNA_seq_annotate.output.pseu_si, pseu_siu = rules.small_RNA_seq_annotate.output.pseu_siu, + satel_si = rules.small_RNA_seq_annotate.output.satel_si, satel_siu = rules.small_RNA_seq_annotate.output.satel_siu, + simrep_si = rules.small_RNA_seq_annotate.output.simrep_si, simrep_siu = rules.small_RNA_seq_annotate.output.simrep_siu, + all_si = 
rules.small_RNA_seq_annotate.output.all_si, + all_siu = rules.small_RNA_seq_annotate.output.all_siu, output: si = OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), "siRNA.fastq.gz"), siu = OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), "siuRNA.fastq.gz"), sisiu = OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), "sisiuRNA.fastq.gz"), + prot_sisiu = OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), "prot_sisiuRNA.fastq.gz"), + te_sisiu = OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), "te_sisiuRNA.fastq.gz"), + pseu_sisiu = OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), "pseu_sisiuRNA.fastq.gz"), + satel_sisiu = OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), "satel_sisiuRNA.fastq.gz"), + simrep_sisiu = OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), "simrep_sisiuRNA.fastq.gz"), + all_sisiu = OPJ(reads_dir, "{lib}_{rep}_%s_on_%s" % (size_selected, genome), "all_sisiuRNA.fastq.gz"), shell: """ cat {input.prot_si} {input.te_si} {input.pseu_si} {input.satel_si} {input.simrep_si} > {output.si} cat {input.prot_siu} {input.te_siu} {input.pseu_siu} {input.satel_siu} {input.simrep_siu} > {output.siu} cat {output.si} {output.siu} > {output.sisiu} + cat {input.prot_si} {input.prot_siu} > {output.prot_sisiu} + cat {input.te_si} {input.te_siu} > {output.te_sisiu} + cat {input.pseu_si} {input.pseu_siu} > {output.pseu_sisiu} + cat {input.satel_si} {input.satel_siu} > {output.satel_sisiu} + cat {input.simrep_si} {input.simrep_siu} > {output.simrep_sisiu} + cat {input.all_si} {input.all_siu} > {output.all_sisiu} """ @@ -1247,7 +1373,8 @@ rule join_si_counts: counts_dir, f"all_{size_selected}_on_{genome}", "si_counts.txt"), run: - counts_data = pd.concat((pd.read_table(table, index_col="gene") for table in input.counts_tables)) + counts_data = pd.concat(( + pd.read_table(table, index_col="gene") for table in input.counts_tables)) counts_data.index.names 
= ["gene"] counts_data.to_csv(output.counts_table, sep="\t") @@ -1287,7 +1414,7 @@ rule join_sisiu_counts: rule join_all_sisiu_counts: - """sum si and siu into sisiu""" + """sum all_si and all_siu into all_sisiu""" input: all_si_counts_table = OPJ( counts_dir, @@ -1332,6 +1459,12 @@ rule join_pisimi_counts: def source_small_RNA_counts(wildcards): """Determines from which rule the gathered small counts should be sourced.""" + # TODO: add {small_type}_on_{genome}_{biotype}_{orientation}_{feature_type} category to use the featurecounts results + if hasattr(wildcards, "biotype"): + return rules.gather_feature_counts.output.counts_table + #return OPJ( + # output_dir, aligner, f"mapped_{genome}", "feature_count", + # f"all_{wildcards.remapped_counted}_{wildcards.feature_type}_counts.txt") if wildcards.small_type == "pisimi": # all_si and also pi and mi return rules.join_pisimi_counts.output.counts_table @@ -1439,42 +1572,25 @@ rule make_read_counts_summary: #threads: 8 # to limit memory usage, actually run: with open(output.summary, "w") as summary_file: + # TODO: add all_sisiuRNA summary_file.write("%s\n" % "\t".join([ "raw", "trimmed", "deduped", "%s" % size_selected, "nomap_siRNA", "mapped", "non_structural", "ambig_type", "prot_sisiuRNA", "te_sisiuRNA", "pseu_sisiuRNA", "satel_sisiuRNA", "simrep_sisiuRNA", "siRNA", "piRNA", "miRNA"])) - # TODO: use read_int_from_file - #with open(input.nb_raw) as counts_file: - # summary_file.write(counts_file.readline().strip()) summary_file.write("%d\t" % read_int_from_file(input.nb_raw)) - #with open(input.nb_trimmed) as counts_file: - # summary_file.write(counts_file.readline().strip()) summary_file.write("%d\t" % read_int_from_file(input.nb_trimmed)) - #with open(input.nb_deduped) as counts_file: - # summary_file.write(counts_file.readline().strip()) summary_file.write("%d\t" % read_int_from_file(input.nb_deduped)) - #with open(input.nb_size_selected) as counts_file: - # summary_file.write(counts_file.readline().strip()) 
summary_file.write("%d\t" % read_int_from_file(input.nb_size_selected)) - #with open(input.nb_nomap_si) as counts_file: - # summary_file.write(counts_file.readline().strip()) summary_file.write("%d\t" % read_int_from_file(input.nb_nomap_si)) - #with open(input.nb_mapped) as counts_file: - # summary_file.write(counts_file.readline().strip()) summary_file.write("%d\t" % read_int_from_file(input.nb_mapped)) - #with open(input.nb_non_structural) as counts_file: - # summary_file.write(counts_file.readline().strip()) summary_file.write("%d\t" % read_int_from_file(input.nb_non_structural)) summary_file.write(str(file_len(input.ambig_type) + file_len(input.ambig_typeU))) summary_file.write("\t") - #summary_file.write(str(file_len(input.si) // 4)) summary_file.write(str(sum_counts(input.prot_si_counts) + sum_counts(input.prot_siu_counts))) summary_file.write("\t") - #summary_file.write(str(file_len(input.si) // 4)) summary_file.write(str(sum_counts(input.te_si_counts) + sum_counts(input.te_siu_counts))) summary_file.write("\t") - #summary_file.write(str(file_len(input.si) // 4)) summary_file.write(str(sum_counts(input.pseu_si_counts) + sum_counts(input.pseu_siu_counts))) summary_file.write("\t") summary_file.write(str(sum_counts(input.satel_si_counts) + sum_counts(input.satel_siu_counts))) @@ -1497,10 +1613,8 @@ rule make_read_counts_summary: raise IOError(erru) summary_file.write(str(int(result.strip().split()[0]) + int(resultu.strip().split()[0]))) summary_file.write("\t") - #summary_file.write(str(file_len(input.pi) // 4)) summary_file.write(str(sum_counts(input.pi_counts))) summary_file.write("\t") - #summary_file.write(str(file_len(input.mi) // 4)) summary_file.write(str(sum_counts(input.mi_counts))) summary_file.write("\n") @@ -1732,6 +1846,15 @@ def plot_fold_heatmap(data, gene_colours=None, labels_dict=None): #class DefaultCounter(defaultdict, Counter): # pass +# TODO: Check if errors due to bad data can be caught: dendrogram excessive recursion and +# TypeError in 
line 1750 of /pasteur/homes/bli/src/bioinfo_utils/small_RNA-seq/small_RNA-seq.snakefile: +# unsupported operand type(s) for -: 'str' and 'int' +# File "/pasteur/homes/bli/src/bioinfo_utils/small_RNA-seq/small_RNA-seq.snakefile", line 1750, in __rule_make_fold_heatmap +# File "/home/bli/.local/lib/python3.6/site-packages/pandas/io/parsers.py", line 709, in parser_f +# File "/home/bli/.local/lib/python3.6/site-packages/pandas/io/parsers.py", line 455, in _read +# File "/home/bli/.local/lib/python3.6/site-packages/pandas/io/parsers.py", line 1069, in read +# File "/home/bli/.local/lib/python3.6/site-packages/pandas/io/parsers.py", line 1846, in read +# File "/home/bli/.local/lib/python3.6/site-packages/pandas/io/parsers.py", line 3221, in _get_empty_meta rule make_fold_heatmap: """With a full list of genes, this rule can use a lot of memory.""" input: