diff --git a/Ribo-seq/Ribo-seq.snakefile b/Ribo-seq/Ribo-seq.snakefile index 8591dc509a53c262b6bc811b3e14766ffdd83ebd..3ac81ecd877a1863f5c704f91dc09a257d9d5d1b 100644 --- a/Ribo-seq/Ribo-seq.snakefile +++ b/Ribo-seq/Ribo-seq.snakefile @@ -136,7 +136,7 @@ from libhts import plot_paired_scatters, plot_norm_correlations, plot_counts_dis from libworkflows import texscape, ensure_relative, cleanup_and_backup from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter from libworkflows import read_int_from_file, strip_split, file_len, last_lines, save_plot, SHELL_FUNCTIONS -from libworkflows import filter_combinator, sum_feature_counts, sum_htseq_counts, warn_context +from libworkflows import sum_feature_counts, sum_htseq_counts, warn_context from smincludes import rules as irules strip = str.strip @@ -348,8 +348,6 @@ def sum_te_counts(fname): return int(result.strip().split()[0]) -filtered_product = filter_combinator(product, forbidden) - # Limit risks of ambiguity by imposing replicates to be numbers # and restricting possible forms of some other wildcards wildcard_constraints: @@ -384,23 +382,14 @@ shell.prefix(SHELL_FUNCTIONS) bigwig_files = [ # individual libraries expand( - expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", - "{lib}_{rep}_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), - filtered_product, lib=LIBS, rep=REPS), - read_type=READ_TYPES_FOR_MAPPING, norm=SIZE_FACTORS, orientation=["all"]), - #expand( - # expand( - # OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{lib}_{rep}_{{read_type}}_on_%s_{{orientation}}.bw" % genome), - # filtered_product, lib=LIBS, rep=REPS), - # read_type=READ_TYPES_FOR_MAPPING, orientation=ORIENTATIONS), + OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", + "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), + lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_MAPPING, norm=SIZE_FACTORS, orientation=["all"]), # means of replicates expand( - expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", - "{lib}_mean_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), - filtered_product, lib=LIBS), - read_type=READ_TYPES_FOR_MAPPING, norm=SIZE_FACTORS, orientation=["all"]), + OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", + "{lib}_mean_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), + lib=LIBS, read_type=READ_TYPES_FOR_MAPPING, norm=SIZE_FACTORS, orientation=["all"]), ] meta_profiles = [ @@ -436,38 +425,26 @@ meta_profiles = [ read_graphs = [ expand( - expand( - OPJ(output_dir, "figures", "{{lib}}_{{rep}}", "{read_type}_base_composition_from_{position}.{fig_format}"), - read_type=READ_TYPES_FOR_COMPOSITION, position=["start", "end"], fig_format=FIG_FORMATS), - filtered_product, lib=LIBS, rep=REPS), + OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_base_composition_from_{position}.{fig_format}"), + lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_COMPOSITION, position=["start", "end"], fig_format=FIG_FORMATS), expand( - expand( - OPJ(output_dir, "figures", "{{lib}}_{{rep}}", "{read_type}_base_logo_from_{position}.{fig_format}"), - read_type=READ_TYPES_FOR_COMPOSITION, position=["start", "end"], fig_format=FIG_FORMATS), - filtered_product, lib=LIBS, rep=REPS), + OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_base_logo_from_{position}.{fig_format}"), + lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_COMPOSITION, position=["start", "end"], fig_format=FIG_FORMATS), expand( - expand( - OPJ(output_dir, "figures", "{{lib}}_{{rep}}", "{read_type}_{position}_base_composition.{fig_format}"), - read_type=READ_TYPES_FOR_COMPOSITION, position=POSITIONS, fig_format=FIG_FORMATS), - filtered_product, lib=LIBS, rep=REPS), + OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_{position}_base_composition.{fig_format}"), + lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_COMPOSITION, position=POSITIONS, fig_format=FIG_FORMATS), expand( - expand( - OPJ(output_dir, "figures", "{{lib}}_{{rep}}", "{read_type}_{position}_base_logo.{fig_format}"), - read_type=READ_TYPES_FOR_COMPOSITION, position=POSITIONS, fig_format=FIG_FORMATS), - filtered_product, lib=LIBS, rep=REPS), + OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_{position}_base_logo.{fig_format}"), + lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_COMPOSITION, position=POSITIONS, fig_format=FIG_FORMATS), expand( - expand( - OPJ(output_dir, "figures", "{{lib}}_{{rep}}", "{read_type}_size_distribution.{fig_format}"), - read_type=["trimmed", "nomap"], fig_format=FIG_FORMATS), - filtered_product, lib=LIBS, rep=REPS), + OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_size_distribution.{fig_format}"), + lib=LIBS, rep=REPS, read_type=["trimmed", "nomap"], fig_format=FIG_FORMATS), expand( - expand( - OPJ(output_dir, "figures", "{{lib}}_{{rep}}", f"{size_selected}_smallRNA_barchart.{{fig_format}}"), - fig_format=FIG_FORMATS), filtered_product, lib=LIBS, rep=REPS), + OPJ(output_dir, "figures", "{lib}_{rep}", f"{size_selected}_smallRNA_barchart.{{fig_format}}"), + lib=LIBS, rep=REPS, fig_format=FIG_FORMATS), expand( - expand( - OPJ(output_dir, "figures", "{{lib}}_{{rep}}", "nb_reads.{fig_format}"), - fig_format=FIG_FORMATS), filtered_product, lib=LIBS, rep=REPS), + OPJ(output_dir, "figures", "{lib}_{rep}", "nb_reads.{fig_format}"), + lib=LIBS, rep=REPS, fig_format=FIG_FORMATS), ] if contrasts_dict["ip"]: @@ -527,25 +504,31 @@ rule all: input: expand( OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_nb_mapped.txt" % genome), - filtered_product, lib=LIBS, rep=REPS, read_type=[size_selected]), + lib=LIBS, rep=REPS, read_type=[size_selected]), expand( OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_coverage.txt" % genome), - filtered_product, lib=LIBS, rep=REPS, read_type=[size_selected]), + lib=LIBS, rep=REPS, read_type=[size_selected]), expand( OPJ(output_dir, "figures", "{lib}_{rep}", "nb_reads.{fig_format}"), - filtered_product, lib=LIBS, rep=REPS, fig_format=FIG_FORMATS), + lib=LIBS, rep=REPS, fig_format=FIG_FORMATS), expand( - expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", - "{{lib}}_{{rep}}_%s_on_%s_{orientation}_counts.txt" % (size_selected, genome)), - orientation=ORIENTATIONS), - filtered_product, lib=LIBS, rep=REPS), + OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", + "{lib}_{rep}_{read_type}_on_%s_{orientation}_counts.txt" % genome), + lib=LIBS, rep=REPS, read_type=["RPF"], orientation=ORIENTATIONS), + # TODO: adapt from RNA-seq + # expand( + # OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"), + # lib=LIBS, rep=REPS), + # expand( + # expand( + # OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", + # f"all_on_{genome}", "{biotype}_{orientation}_TPM.txt"), + # biotype=["alltypes"], orientation=ORIENTATIONS), bigwig_files, # rule future_all: # meta_profiles, # read_graphs, -# #expand(OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", "{lib}_{rep}_nb_non_structural.txt"), filtered_product, lib=LIBS, rep=REPS), # OPJ(output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "pisimi_mean_log2_RPM_fold.txt"), # expand(OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]), # #expand(OPJ(output_dir, aligner, f"mapped_{genome}", "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=IP_CONTRASTS, small_type=DE_TYPES), @@ -1056,26 +1039,6 @@ rule gather_read_counts_summaries: summaries.to_csv(output.summary_table, sep="\t") -# Moved to script compute_genes_exon_lengths.py -#rule compute_genes_exon_lengths: -# """To each gene ID, associate a length obtained by taking the union of all exons belonging to transcripts of this gene.""" -# input: -# genes_gtf = OPJ(annot_dir, "genes.gtf"), -# dte_bed = OPJ(annot_dir, "DNA_transposons_rmsk.bed"), -# rte_bed = OPJ(annot_dir, "RNA_transposons_rmsk.bed"), -# satel_bed = OPJ(annot_dir, "satellites_rmsk.bed"), -# simrep_bed = OPJ(annot_dir, "simple_repeats_rmsk.bed"), -# -# output: -# exon_lengths = OPJ(annot_dir, "union_exon_lengths.txt"), -# run: -# pd.concat(( -# gtf_2_genes_exon_lengths(input.genes_gtf), -# repeat_bed_2_lengths(input.dte_bed), -# repeat_bed_2_lengths(input.rte_bed), -# repeat_bed_2_lengths(input.satel_bed), -# repeat_bed_2_lengths(input.simrep_bed))).to_csv(output.exon_lengths, sep="\t") - # TODO: use total RPF instead of non_structural to normalize rule compute_RPM: input: