diff --git a/CLIP/iCLIP.snakefile b/CLIP/iCLIP.snakefile index b9e2e9d068b85a574dddfb73049a83ef64bd1ee4..e05d71b33d24086e79043419772784aeaf77a25b 100644 --- a/CLIP/iCLIP.snakefile +++ b/CLIP/iCLIP.snakefile @@ -54,9 +54,11 @@ merged_fastq = config["merged_fastq"] barcode_dict = config["barcode_dict"] BARCODES = list(barcode_dict.keys()) MAX_DIFF = config["max_diff"] -output_dir = config["output_dir"] -log_dir = OPJ(output_dir, "logs") -data_dir = OPJ(output_dir, "data") +#output_dir = config["output_dir"] +#workdir: config["output_dir"] +output_dir = os.path.abspath(".") +log_dir = OPJ("logs") +data_dir = OPJ("data") demux_dir = OPJ(data_dir, f"demultiplexed_{MAX_DIFF}") lib2raw = defaultdict(dict) REPS = set() @@ -144,21 +146,21 @@ preprocessing = [ mapping = [ ## Will be pulled in as dependencies of other needed results: - # expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED), + # expand(OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED), ## expand( - OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_samtools_stats.txt" % genome), + OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_samtools_stats.txt" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED + [f"{to_map}_unmapped" for to_map in POST_TRIMMING + SIZE_SELECTED]), ] counting = [ ## Will be pulled in as dependencies of other needed results: - # expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS), + # expand(OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS), ## - expand(OPJ(output_dir, "{trimmer}", aligner, f"mapped_{genome}", "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED, orientation=ORIENTATIONS), - expand(OPJ(output_dir, "{trimmer}", aligner, f"mapped_{genome}", "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS), - expand(OPJ(output_dir, "{trimmer}", aligner, f"mapped_{genome}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED, norm=NORM_TYPES, orientation=["all"]), + expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED, orientation=ORIENTATIONS), + expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS), + expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED, norm=NORM_TYPES, orientation=["all"]), ] #TODO: @@ -407,8 +409,8 @@ rule map_on_genome: fastq = source_fastq, output: # sam files take a lot of space - sam = temp(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s.sam" % genome)), - nomap_fastq = OPJ(output_dir, "{trimmer}", aligner, "info_mapping_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome), + sam = temp(OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s.sam" % genome)), + nomap_fastq = OPJ("{trimmer}", aligner, "info_mapping_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome), wildcard_constraints: read_type = "|".join(POST_TRIMMING + SIZE_SELECTED) params: @@ -430,11 +432,11 @@ rule remap_on_genome: input: # fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}.fastq.gz"), #fastq = rules.map_on_genome.output.nomap_fastq, - fastq = OPJ(output_dir, "{trimmer}", aligner, "info_mapping_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome), + fastq = OPJ("{trimmer}", aligner, "info_mapping_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome), output: # sam files take a lot of space - sam = temp(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_on_%s.sam" % genome)), - nomap_fastq = OPJ(output_dir, "{trimmer}", aligner, "info_mapping_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_unmapped_on_%s.fastq.gz" % genome), + sam = temp(OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_on_%s.sam" % genome)), + nomap_fastq = OPJ("{trimmer}", aligner, "info_mapping_%s" % genome, "{lib}_{rep}_{read_type}_unmapped_unmapped_on_%s.fastq.gz" % genome), wildcard_constraints: read_type = "|".join(POST_TRIMMING + SIZE_SELECTED) #wildcard_constraints: @@ -456,10 +458,10 @@ rule remap_on_genome: rule sam2indexedbam: input: - sam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s.sam" % genome), + sam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s.sam" % genome), output: - sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome), - index = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam.bai" % genome), + sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome), + index = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam.bai" % genome), message: "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}." log: @@ -475,7 +477,7 @@ rule compute_mapping_stats: input: sorted_bam = rules.sam2indexedbam.output.sorted_bam, output: - stats = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_samtools_stats.txt" % genome), + stats = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_samtools_stats.txt" % genome), shell: """samtools stats {input.sorted_bam} > {output.stats}""" @@ -484,11 +486,11 @@ rule fuse_bams: """This rule fuses the two sorted bam files corresponding to the mapping of the reads containing the adaptor or not.""" input: - noadapt_sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_noadapt_on_%s_sorted.bam" % genome), - adapt_sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_adapt_on_%s_sorted.bam" % genome), + noadapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_noadapt_on_%s_sorted.bam" % genome), + adapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_adapt_on_%s_sorted.bam" % genome), output: - sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_on_%s_sorted.bam" % genome), - bai = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_on_C_%s_sorted.bam.bai" % genome), + sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_on_%s_sorted.bam" % genome), + bai = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_on_C_%s_sorted.bam.bai" % genome), message: "Fusing sorted bam files for {wildcards.lib}_{wildcards.rep}" log: @@ -520,11 +522,11 @@ def biotype2annot(wildcards): rule feature_count_reads: input: - sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome), - bai = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam.bai" % genome), + sorted_bam = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam" % genome), + bai = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_sorted.bam.bai" % genome), output: - counts = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), - counts_converted = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts_gene_names.txt"), + counts = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), + counts_converted = OPJ("{trimmer}", aligner, "mapped_C_elegans", "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts_gene_names.txt"), params: stranded = feature_orientation2stranded(LIB_TYPE), annot = biotype2annot, @@ -549,9 +551,9 @@ rule feature_count_reads: rule summarize_feature_counts: """For a given library, compute the total counts for each biotype and write this in a summary table.""" input: - biotype_counts_files = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "{{lib}}_{{rep}}_{{read_type}}_on_%s" % genome, "{biotype}_{{orientation}}_counts.txt"), biotype=COUNT_BIOTYPES), + biotype_counts_files = expand(OPJ("{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "{{lib}}_{{rep}}_{{read_type}}_on_%s" % genome, "{biotype}_{{orientation}}_counts.txt"), biotype=COUNT_BIOTYPES), output: - summary = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "{lib}_{rep}_{read_type}_on_%s_{orientation}_counts.txt" % genome), + summary = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "{lib}_{rep}_{read_type}_on_%s_{orientation}_counts.txt" % genome), run: sum_counter = sum_feature_counts with open(output.summary, "w") as summary_file: @@ -563,12 +565,11 @@ rule summarize_feature_counts: rule gather_read_counts_summaries: input: - summary_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "{lib}_{rep}_{{read_type}}_on_%s_{{orientation}}_counts.txt" % genome), lib=LIBS, rep=REPS), + summary_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "{lib}_{rep}_{{read_type}}_on_%s_{{orientation}}_counts.txt" % genome), lib=LIBS, rep=REPS), output: - summary_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome), + summary_table = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome), run: summary_files = (OPJ( - output_dir, wildcards.trimmer, aligner, "mapped_%s" % genome, @@ -585,9 +586,9 @@ rule gather_read_counts_summaries: rule gather_counts: """For a given biotype, gather counts from all libraries in one table.""" input: - counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{{read_type}}_on_%s" % genome, "{{biotype}}_{{orientation}}_counts.txt"), lib=LIBS, rep=REPS), + counts_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{{read_type}}_on_%s" % genome, "{{biotype}}_{{orientation}}_counts.txt"), lib=LIBS, rep=REPS), output: - counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), + counts_table = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), # wildcard_constraints: # # Avoid ambiguity with join_all_counts # biotype = "|".join(COUNT_BIOTYPES) @@ -595,7 +596,6 @@ rule gather_counts: # Gathering the counts data ############################ counts_files = (OPJ( - output_dir, wildcards.trimmer, aligner, "mapped_%s" % genome, @@ -635,7 +635,7 @@ rule compute_median_ratio_to_pseudo_ref_size_factors: input: counts_table = rules.gather_counts.output.counts_table, output: - median_ratios_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_median_ratios_to_pseudo_ref.txt"), + median_ratios_file = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_median_ratios_to_pseudo_ref.txt"), run: counts_data = pd.read_table( input.counts_table, @@ -652,7 +652,7 @@ rule compute_median_ratio_to_pseudo_ref_size_factors: def source_norm_file(wildcards): if wildcards.norm == "median_ratio_to_pseudo_ref": - return OPJ(output_dir, f"{wildcards.trimmer}", aligner, f"mapped_{genome}", "feature_count", f"all_{wildcards.read_type}_on_%s" % genome, "protein_coding_fwd_median_ratios_to_pseudo_ref.txt") + return OPJ(f"{wildcards.trimmer}", aligner, f"mapped_{genome}", "feature_count", f"all_{wildcards.read_type}_on_%s" % genome, "protein_coding_fwd_median_ratios_to_pseudo_ref.txt") else: return rules.summarize_feature_counts.output.summary @@ -664,11 +664,11 @@ rule make_normalized_bigwig: # TODO: use sourcing function based on norm norm_file = source_norm_file, #size_factor_file = rules.compute_coverage.output.coverage - #median_ratios_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"), + #median_ratios_file = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"), # TODO: compute this - #scale_factor_file = OPJ(output_dir, aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "pisimi_median_ratios_to_pseudo_ref.txt"), + #scale_factor_file = OPJ(aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "pisimi_median_ratios_to_pseudo_ref.txt"), output: - bigwig_norm = OPJ(output_dir, "{trimmer}", aligner, f"mapped_{genome}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), + bigwig_norm = OPJ("{trimmer}", aligner, f"mapped_{genome}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), #params: # orient_filter = bamcoverage_filter, threads: 12 # to limit memory usage, actually diff --git a/Degradome-seq/Degradome-seq.snakefile b/Degradome-seq/Degradome-seq.snakefile index 93a76b440f7c0d7b30a86fc94a3204fddc53a8ec..ef159b9a71528445b054d5933e4e23f774fc9c90 100644 --- a/Degradome-seq/Degradome-seq.snakefile +++ b/Degradome-seq/Degradome-seq.snakefile @@ -83,12 +83,14 @@ annot_dir = genome_dict["annot_dir"] convert_dir = genome_dict["convert_dir"] gene_lists_dir = genome_dict["gene_lists_dir"] avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) -output_dir = config["output_dir"] -log_dir = OPJ(output_dir, f"logs_{genome}") -data_dir = OPJ(output_dir, "data") +#output_dir = config["output_dir"] +#workdir: config["output_dir"] +output_dir = os.path.abspath(".") +log_dir = OPJ(f"logs_{genome}") +data_dir = OPJ("data") counter = "feature_count" -counts_dir = OPJ(output_dir, aligner, f"mapped_{genome}", counter) +counts_dir = OPJ(aligner, f"mapped_{genome}", counter) # Limit risks of ambiguity by imposing replicates to be numbers # and restricting possible forms of some other wildcards @@ -103,7 +105,7 @@ rule all: # OPJ(data_dir, "trimmed", "{lib}_{rep}.fastq.gz"), # lib=LIBS, rep=REPS), # expand( - # OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam" % genome), + # OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam" % genome), # lib=LIBS, rep=REPS), # expand( # OPJ(counts_dir, @@ -151,8 +153,8 @@ rule map_on_genome: fastq = rules.trim_reads.output.trimmed, output: # temp because it uses a lot of space - sam = temp(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s.sam" % genome)), - nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s.fastq.gz" % genome), + sam = temp(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s.sam" % genome)), + nomap_fastq = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s.fastq.gz" % genome), params: aligner = aligner, index = genome_db, @@ -174,8 +176,8 @@ rule sam2indexedbam: input: sam = rules.map_on_genome.output.sam, output: - sorted_bam = protected(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam" % genome)), - index = protected(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam.bai" % genome)), + sorted_bam = protected(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam" % genome)), + index = protected(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam.bai" % genome)), message: "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}." log: @@ -377,7 +379,7 @@ rule compute_RPK: input: counts_data = source_counts, #counts_data = rules.gather_counts.output.counts_table, - #counts_data = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", + #counts_data = OPJ(aligner, f"mapped_{genome}", "feature_count", # f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"), output: rpk_file = OPJ(counts_dir, diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile index 14273b7fcd97ae92db3035b7feaa17b4caa9c6ae..2d31613ecc60dfb8e80f4acdb434579ce3933c4d 100644 --- a/PRO-seq/PRO-seq.snakefile +++ b/PRO-seq/PRO-seq.snakefile @@ -171,21 +171,40 @@ ID_LISTS = [ "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014", "piRNA_dependent_prot_si_22G_down4_top200", "piRNA_dependent_prot_si_22G_down4", "csr1_prot_si_supertargets_common"] -annot_dir = config["annot_dir"] -gene_lists_dir = "/pasteur/entites/Mhe/bli/Gene_lists" +aligner = config["aligner"] +######################## +# Genome configuration # +######################## +genome_dict = config.get("genome_dict", None) +if genome_dict is not None: + genome = genome_dict["name"] + chrom_sizes = get_chrom_sizes(genome_dict["size"]) + genomelen = sum(chrom_sizes.values()) + genome_db = genome_dict["db"][aligner] + # bed file binning the genome in 10nt bins + genome_binned = genome_dict["binned"] + annot_dir = genome_dict["annot_dir"] + # TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"] + convert_dir = genome_dict["convert_dir"] + gene_lists_dir = genome_dict["gene_lists_dir"] +else: + genome = "C_elegans" + chrom_sizes = get_chrom_sizes(config["genome_size"]) + genomelen = sum(chrom_sizes.values()) + genome_db = config["index"] + genome_binned = f"/pasteur/entites/Mhe/Genomes/{genome}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/genome_binned_10.bed" + annot_dir = config["annot_dir"] + convert_dir = config["convert_dir"] + gene_lists_dir = "/pasteur/entites/Mhe/bli/Gene_lists" avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) -chrom_sizes = get_chrom_sizes(config["genome_size"]) -genomelen = sum(chrom_sizes.values()) -genome_binned = "/pasteur/entites/Mhe/Genomes/C_elegans/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/genome_binned_10.bed" #gene_lists_dir = config["gene_lists_dir"] #local_annot_dir = config["local_annot_dir"] -aligner = config["aligner"] -index = config["index"] -convert_dir = config["convert_dir"] -output_dir = config["output_dir"] -log_dir = OPJ(output_dir, "logs") -data_dir = OPJ(output_dir, "data") -local_annot_dir = OPJ(output_dir, "annotations") +#output_dir = config["output_dir"] +#workdir: config["output_dir"] +output_dir = os.path.abspath(".") +log_dir = OPJ("logs") +data_dir = OPJ("data") +local_annot_dir = OPJ("annotations") # Used to skip some genotype x treatment x replicate number combinations # when some of them were not sequenced forbidden = {frozenset(wc_comb.items()) for wc_comb in config["missing"]} @@ -293,7 +312,7 @@ if len(CONDITIONS) < 2: pca_figs = [] else: pca_figs = expand(OPJ( - output_dir, "{trimmer}", "figures", aligner, "{counter}", + "{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.pdf"), trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS), @@ -302,45 +321,45 @@ rule all: """This top rule is used to drive the whole workflow by taking as input its final products.""" input: expand(OPJ( - output_dir, "{trimmer}", "figures", aligner, "{counter}", + "{trimmer}", "figures", aligner, "{counter}", "{contrast}", "{orientation}_{biotype}", "MA_with_{id_list}.pdf"), trimmer=TRIMMERS, counter=COUNTERS, contrast=CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES, id_list=ID_LISTS), expand(OPJ( - output_dir, "{trimmer}", "figures", aligner, "{counter}", + "{trimmer}", "figures", aligner, "{counter}", "{contrast}", "{orientation}_{biotype}", "{fold_type}_distribution.pdf"), trimmer=TRIMMERS, counter=COUNTERS, contrast=CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES, fold_type=["log2FoldChange"]), expand(OPJ( - output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", + "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "deseq2", "{contrast}", "{orientation}_{biotype}", "counts_and_res.txt"), trimmer=TRIMMERS, counter=COUNTERS, contrast=CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES), pca_figs, #expand(OPJ( - # output_dir, "{trimmer}", "figures", aligner, "{counter}", + # "{trimmer}", "figures", aligner, "{counter}", # "{biotype}_{orientation}_PCA.pdf"), # trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES, # orientation=ORIENTATIONS), - #expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"), trimmer=TRIMMERS, counter=COUNTERS, orientation=["all"]), - #expand(expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{lib}_{rep}_on_C_elegans", "{{biotype}}_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS), trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES, orientation=["all"]), + #expand(OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"), trimmer=TRIMMERS, counter=COUNTERS, orientation=["all"]), + #expand(expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{lib}_{rep}_on_C_elegans", "{{biotype}}_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS), trimmer=TRIMMERS, counter=COUNTERS, biotype=COUNT_BIOTYPES, orientation=["all"]), expand(expand(OPJ( - output_dir, "{{trimmer}}", "figures", aligner, "{lib}_{rep}", + "{{trimmer}}", "figures", aligner, "{lib}_{rep}", "adapt_on_C_elegans_last_bases.pdf"), filtered_product, lib=LIBS, rep=REPS), trimmer=TRIMMERS), expand(expand(OPJ( - output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", + "{{trimmer}}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_by_{{norm_type}}_{{orientation}}.bw"), filtered_product, lib=LIBS, rep=REPS), trimmer=TRIMMERS, norm_type=NORM_TYPES, orientation=["all"]), expand(OPJ( - output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", + "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_TPM.txt"), trimmer=TRIMMERS, counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS), - #expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=["protein_coding"]), - #expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=METAGENE_BIOTYPES), + #expand(OPJ("{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=["protein_coding"]), + #expand(OPJ("{trimmer}", "figures", aligner, "{lib}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, orientation=["all"], biotype=METAGENE_BIOTYPES), # TODO: Add metagene profiles similar to small RNA-seq expand(OPJ( - output_dir, "{trimmer}", "figures", aligner, "{lib}_by_{norm_type}_mean", + "{trimmer}", "figures", aligner, "{lib}_by_{norm_type}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, norm_type=NORM_TYPES, orientation=["all", "fwd", "rev"], biotype=METAGENE_BIOTYPES), @@ -448,14 +467,14 @@ rule map_on_genome: fastq = source_fastq, output: # sam files take a lot of space - sam = temp(OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans.sam")), - nomap_fastq = OPJ(output_dir, "{trimmer}", aligner, "not_mapped_C_elegans", "{lib}_{rep}_{type}_unmapped_on_C_elegans.fastq.gz"), + sam = temp(OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans.sam")), + nomap_fastq = OPJ("{trimmer}", aligner, "not_mapped_C_elegans", "{lib}_{rep}_{type}_unmapped_on_C_elegans.fastq.gz"), params: aligner = aligner, - index = index, + index = genome_db, settings = "", message: - "Mapping {wildcards.lib}_{wildcards.rep}_{wildcards.type} on C. elegans genome." + "Mapping {wildcards.lib}_{wildcards.rep}_{wildcards.type} on %s genome." % genome log: log = OPJ(log_dir, "{trimmer}", "map_{type}_on_genome", "{lib}_{rep}.log"), err = OPJ(log_dir, "{trimmer}", "map_{type}_on_genome", "{lib}_{rep}.err"), @@ -474,10 +493,10 @@ rule map_on_genome: rule sam2indexedbam: input: - sam = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans.sam"), + sam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans.sam"), output: - sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans_sorted.bam"), - index = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans_sorted.bam.bai"), + sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans_sorted.bam"), + index = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans_sorted.bam.bai"), message: "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}_{wildcards.type}." log: @@ -493,11 +512,11 @@ rule fuse_bams: """This rule fuses the two sorted bam files corresponding to the mapping of the reads containing the adaptor or not.""" input: - noadapt_sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_noadapt_on_C_elegans_sorted.bam"), - adapt_sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_sorted.bam"), + noadapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_noadapt_on_C_elegans_sorted.bam"), + adapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_sorted.bam"), output: - sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam"), - bai = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam.bai"), + sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam"), + bai = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam.bai"), message: "Fusing sorted bam files for {wildcards.lib}_{wildcards.rep}" log: @@ -523,7 +542,7 @@ rule compute_coverage: input: sorted_bam = rules.fuse_bams.output.sorted_bam, output: - coverage = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_coverage.txt"), + coverage = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_coverage.txt"), params: genomelen = genomelen, shell: @@ -535,10 +554,10 @@ rule compute_coverage: rule check_last_base: input: - adapt_sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_sorted.bam"), - adapt_index = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_sorted.bam.bai"), + adapt_sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_sorted.bam"), + adapt_index = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_sorted.bam.bai"), output: - OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_last_bases.txt") + OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_last_bases.txt") message: "Computing last base proportions for {wildcards.lib}_{wildcards.rep} (mapped reads from which the adaptor had been removed)." log: @@ -565,9 +584,9 @@ rule check_last_base: # TODO: use Python to make the plot rule plot_last_base: input: - OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_last_bases.txt") + OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_last_bases.txt") output: - OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_{rep}", "adapt_on_C_elegans_last_bases.pdf") + OPJ("{trimmer}", "figures", aligner, "{lib}_{rep}", "adapt_on_C_elegans_last_bases.pdf") params: title = lambda wildcards : "\"last base frequencies for %s_%s_%s\"" % (wildcards.trimmer, wildcards.lib, wildcards.rep) message: @@ -613,11 +632,11 @@ def biotype2annot(wildcards): rule htseq_count_reads: input: - sorted_bam = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam"), - bai = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam.bai"), + sorted_bam = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam"), + bai = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam.bai"), output: - counts = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "htseq_count", "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts.txt"), - counts_converted = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "htseq_count", "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts_gene_names.txt"), + counts = OPJ("{trimmer}", aligner, "mapped_C_elegans", "htseq_count", "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts.txt"), + counts_converted = OPJ("{trimmer}", aligner, "mapped_C_elegans", "htseq_count", "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts_gene_names.txt"), params: stranded = htseq_orientation2stranded, mode = "union", @@ -645,24 +664,26 @@ def parse_htseq_counts(counts_filename): rule feature_count_reads: input: sorted_bam = OPJ( - output_dir, "{trimmer}", aligner, "mapped_C_elegans", + "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam"), bai = OPJ( - output_dir, "{trimmer}", aligner, "mapped_C_elegans", + "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_sorted.bam.bai"), # TODO: Why does the following fail? #sorted_bam = rules.fuse_bams.output.sorted_bam, #index = rules.fuse_bams.output.index, output: counts = OPJ( - output_dir, "{trimmer}", aligner, "mapped_C_elegans", "feature_count", + "{trimmer}", aligner, "mapped_C_elegans", "feature_count", "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts.txt"), counts_converted = OPJ( - output_dir, "{trimmer}", aligner, "mapped_C_elegans", "feature_count", + "{trimmer}", aligner, "mapped_C_elegans", "feature_count", "{lib}_{rep}_on_C_elegans", "{biotype}_{orientation}_counts_gene_names.txt"), params: stranded = feature_orientation2stranded(LIB_TYPE), annot = biotype2annot, + # pickled dictionary that associates gene ids to gene names + converter = genome_dict["converter"] message: "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with featureCounts." log: @@ -670,14 +691,13 @@ rule feature_count_reads: err = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{orientation}_{biotype}", "{lib}_{rep}.err") shell: """ - converter="/pasteur/entites/Mhe/Genomes/C_elegans/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes_id2name.pickle" tmpdir=$(mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX") cmd="featureCounts -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}" featureCounts -v 2> {log.log} echo ${{cmd}} 1>> {log.log} eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed" rm -rf ${{tmpdir}} - cat {output.counts} | id2name.py ${{converter}} > {output.counts_converted} + cat {output.counts} | id2name.py {params.converter} > {output.counts_converted} """ @@ -720,11 +740,11 @@ def plot_scatterplot(outfile, data, data_groups, group2colour): rule do_PCA: input: - expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{lib}_{rep}_on_C_elegans", "{{biotype}}_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS), + expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{lib}_{rep}_on_C_elegans", "{{biotype}}_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS), output: - #OPJ(output_dir, aligner, "mapped_C_elegans", "htseq_count", "summaries", "{biotype}_{orientation}_PCA.pdf"), - #OPJ(output_dir, aligner, "mapped_C_elegans", "htseq_count", "summaries", "{biotype}_{orientation}_PCA.png"), - OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.pdf"), + #OPJ(aligner, "mapped_C_elegans", "htseq_count", "summaries", "{biotype}_{orientation}_PCA.pdf"), + #OPJ(aligner, "mapped_C_elegans", "htseq_count", "summaries", "{biotype}_{orientation}_PCA.png"), + OPJ("{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.pdf"), message: "Summarizing counts for {wildcards.biotype}_{wildcards.orientation}" threads: 12 # trying to avoid TimeoutError and "LOCKERROR: matplotlib is trying to acquire the lock [...]" @@ -746,7 +766,7 @@ rule do_PCA: lib_categories = np.fromiter(chain(*[[i] * nb_reps for i in range(nb_libs)]), dtype=np.uint32) for i, (lib, rep) in enumerate(product(LIBS, REPS)): counts_filename = OPJ( - output_dir, wildcards.trimmer, aligner, "mapped_C_elegans", wildcards.counter, + wildcards.trimmer, aligner, "mapped_C_elegans", wildcards.counter, "%s_%s_on_C_elegans" % (lib, rep), "%s_%s_counts.txt" % (wildcards.biotype, wildcards.orientation)) #print("Reading", counts_filename) counts[(lib, rep)] = OrderedDict(counts_parser(counts_filename)) @@ -791,9 +811,9 @@ rule do_PCA: rule summarize_counts: """For a given library, write a summary of the read counts for the various biotypes.""" input: - biotype_counts_files = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{{lib}}_{{rep}}_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=COUNT_BIOTYPES), + biotype_counts_files = expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{{lib}}_{{rep}}_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=COUNT_BIOTYPES), output: - summary = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "summaries", "{lib}_{rep}_on_C_elegans_{orientation}_counts.txt") + summary = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "summaries", "{lib}_{rep}_on_C_elegans_{orientation}_counts.txt") run: if wildcards.counter == "htseq_count": sum_counter = sum_htseq_counts @@ -812,12 +832,11 @@ rule summarize_counts: rule gather_read_counts_summaries: input: - summary_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "summaries", "{lib}_{rep}_on_C_elegans_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS), + summary_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "summaries", "{lib}_{rep}_on_C_elegans_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS), output: - summary_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "summaries", "all_on_C_elegans_{orientation}_counts.txt"), + summary_table = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "summaries", "all_on_C_elegans_{orientation}_counts.txt"), run: summary_files = (OPJ( - output_dir, wildcards.trimmer, aligner, "mapped_C_elegans", @@ -834,9 +853,9 @@ rule gather_read_counts_summaries: rule gather_counts: """For a given biotype, gather counts from all libraries in one table.""" input: - counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{lib}_{rep}_on_C_elegans", "{{biotype}}_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS), + counts_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "{lib}_{rep}_on_C_elegans", "{{biotype}}_{{orientation}}_counts.txt"), filtered_product, lib=LIBS, rep=REPS), output: - counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"), + counts_table = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"), wildcard_constraints: # Avoid ambiguity with join_all_counts biotype = "|".join(COUNT_BIOTYPES) @@ -844,7 +863,6 @@ rule gather_counts: # Gathering the counts data ############################ counts_files = (OPJ( - output_dir, wildcards.trimmer, aligner, "mapped_C_elegans", @@ -883,7 +901,7 @@ def source_counts_to_join(wildcards): Determines which elementary biotype counts files should be joined to make the desired "joined" biotype. """ return expand( - OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", + OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=BIOTYPES_TO_JOIN[wildcards.biotype]) @@ -893,15 +911,15 @@ rule join_all_counts: """concat counts for all biotypes into all""" input: counts_tables = source_counts_to_join, - #counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=[bty for bty in COUNT_BIOTYPES if bty[-9:] != "_families"]), + #counts_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=[bty for bty in COUNT_BIOTYPES if bty[-9:] != "_families"]), # counts_tables = expand( - # OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", + # OPJ("{{trimmer}}", aligner, "mapped_C_elegans", # "{{counter}}", "all_on_C_elegans", # "{biotype}_{{orientation}}_counts.txt"), # # We only count "protein_coding", not "protein_codin_{5UTR,CDS,3UTR}" # biotype=[b for b in COUNT_BIOTYPES if not b.startswith("protein_coding_")]), output: - counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"), + counts_table = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"), wildcard_constraints: biotype = "|".join(JOINED_BIOTYPES) run: @@ -926,10 +944,10 @@ rule compute_RPK: input: counts_data = source_counts, #counts_data = rules.gather_counts.output.counts_table, - #counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", + #counts_table = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", # "all_on_C_elegans", "{biotype}_{orientation}_counts.txt"), output: - rpk_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", + rpk_file = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_RPK.txt"), params: feature_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"), @@ -947,7 +965,7 @@ rule compute_sum_million_RPK: input: rpk_file = rules.compute_RPK.output.rpk_file, output: - sum_rpk_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", + sum_rpk_file = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_sum_million_RPK.txt"), run: sum_rpk = pd.read_table( @@ -961,7 +979,7 @@ rule compute_TPM: input: rpk_file = rules.compute_RPK.output.rpk_file output: - tpm_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", + tpm_file = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_TPM.txt"), # The sum must be done over all counted features wildcard_constraints: @@ -993,7 +1011,7 @@ rule compute_median_ratio_to_pseudo_ref_size_factors: counts_table = source_counts, #counts_table = rules.gather_counts.output.counts_table, output: - median_ratios_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_median_ratios_to_pseudo_ref.txt"), + median_ratios_file = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "{biotype}_{orientation}_median_ratios_to_pseudo_ref.txt"), run: counts_data = pd.read_table( input.counts_table, @@ -1021,11 +1039,11 @@ def bamcoverage_filter(wildcards): def source_normalizer(wildcards): if wildcards.norm_type == "median_ratio_to_pseudo_ref": return OPJ( - output_dir, f"{wildcards.trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans", + f"{wildcards.trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans", "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"), elif wildcards.norm_type in COUNT_BIOTYPES: return OPJ( - output_dir, f"{wildcards.trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans", + f"{wildcards.trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans", f"{wildcards.norm_type}_fwd_sum_million_RPK.txt"), else: raise NotImplementedError(f"{wildcards.norm_type} normalization not implemented yet.") @@ -1038,12 +1056,12 @@ rule make_normalized_bigwig: # TODO: use sourcing function based on norm_type size_factor_file = source_normalizer, #size_factor_file = rules.compute_coverage.output.coverage - #median_ratios_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans", "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"), + #median_ratios_file = OPJ("{trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans", "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"), # TODO: compute this - #scale_factor_file = OPJ(output_dir, aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "pisimi_median_ratios_to_pseudo_ref.txt"), + #scale_factor_file = OPJ(aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "pisimi_median_ratios_to_pseudo_ref.txt"), output: - bigwig_norm = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_by_{norm_type}_{orientation}.bw"), - #bigwig = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_{orientation}.bw"), + bigwig_norm = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_by_{norm_type}_{orientation}.bw"), + #bigwig = OPJ(aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_{orientation}.bw"), #params: # orient_filter = bamcoverage_filter, threads: 4 # to limit memory usage, actually @@ -1082,12 +1100,12 @@ rule make_bigwig: bam = rules.fuse_bams.output.sorted_bam, # TODO: use sourcing function based on norm_type #size_factor_file = rules.compute_coverage.output.coverage - median_ratios_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans", "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"), + median_ratios_file = OPJ("{trimmer}", aligner, "mapped_C_elegans", COUNTERS[0], "all_on_C_elegans", "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"), # TODO: compute this - #scale_factor_file = OPJ(output_dir, aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "pisimi_median_ratios_to_pseudo_ref.txt"), + #scale_factor_file = OPJ(aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "pisimi_median_ratios_to_pseudo_ref.txt"), output: - bigwig_norm = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_by_{norm_type}_{orientation}_bamCoverage.bw"), - #bigwig = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_{orientation}.bw"), + bigwig_norm = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_by_{norm_type}_{orientation}_bamCoverage.bw"), + #bigwig = OPJ(aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_{orientation}.bw"), params: orient_filter = bamcoverage_filter, threads: 12 # to limit memory usage, actually @@ -1126,17 +1144,17 @@ your deepTools settings and check your input files. def source_bigwigs_for_merge(wildcards): - return [OPJ(output_dir, "{trimmer}".format(trimmer=wildcards.trimmer), aligner, "mapped_C_elegans", "{lib}_{{rep}}_on_C_elegans_by_{norm_type}_{orientation}.bw".format(lib=wildcards.lib, norm_type=wildcards.norm_type, orientation=wildcards.orientation).format(rep=rep)) for rep in REPS if frozenset((wildcards.lib, rep)) not in forbidden] - #return expand(OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_norm_{orientation}.bw"), lib=[wildcards.lib], rep=[rep for rep in REPS if frozenset((wildcards.lib, rep)) not in forbidden], orientation=[wildcards.orientation]) + return [OPJ("{trimmer}".format(trimmer=wildcards.trimmer), aligner, "mapped_C_elegans", "{lib}_{{rep}}_on_C_elegans_by_{norm_type}_{orientation}.bw".format(lib=wildcards.lib, norm_type=wildcards.norm_type, orientation=wildcards.orientation).format(rep=rep)) for rep in REPS if frozenset((wildcards.lib, rep)) not in forbidden] + #return expand(OPJ(aligner, "mapped_C_elegans", "{lib}_{rep}_on_C_elegans_norm_{orientation}.bw"), lib=[wildcards.lib], rep=[rep for rep in REPS if frozenset((wildcards.lib, rep)) not in forbidden], orientation=[wildcards.orientation]) rule merge_bigwig_reps: """This rule merges bigwig files by computing a mean across replicates.""" input: source_bigwigs_for_merge, - #expand(OPJ(output_dir, aligner, "mapped_C_elegans", "{{lib}}_{rep}_on_C_elegans_norm_{{orientation}}.bw"), rep=[rep for rep in REPS if frozenset((wildcards.lib, rep)) not in forbidden]), + #expand(OPJ(aligner, "mapped_C_elegans", "{{lib}}_{rep}_on_C_elegans_norm_{{orientation}}.bw"), rep=[rep for rep in REPS if frozenset((wildcards.lib, rep)) not in forbidden]), output: - bw = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_mean_on_C_elegans_by_{norm_type}_{orientation}.bw"), + bw = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{lib}_mean_on_C_elegans_by_{norm_type}_{orientation}.bw"), log: warnings = OPJ(log_dir, "{trimmer}", "merge_bigwig_reps", "{lib}_mean_on_C_elegans_by_{norm_type}_{orientation}.warnings"), threads: 2 # to limit memory usage, actually @@ -1184,17 +1202,16 @@ rule differential_expression: counts_table = source_counts, summary_table = rules.gather_read_counts_summaries.output.summary_table, output: - deseq_results = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "deseq2", "{contrast}", "{orientation}_{biotype}", "deseq2.txt"), - up_genes = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "deseq2", "{contrast}", "{orientation}_{biotype}", "up_genes.txt"), - down_genes = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "deseq2", "{contrast}", "{orientation}_{biotype}", "down_genes.txt"), - counts_and_res = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "deseq2", "{contrast}", "{orientation}_{biotype}", "counts_and_res.txt"), + deseq_results = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "deseq2", "{contrast}", "{orientation}_{biotype}", "deseq2.txt"), + up_genes = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "deseq2", "{contrast}", "{orientation}_{biotype}", "up_genes.txt"), + down_genes = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "deseq2", "{contrast}", "{orientation}_{biotype}", "down_genes.txt"), + counts_and_res = OPJ("{trimmer}", aligner, "mapped_C_elegans", "{counter}", "deseq2", "{contrast}", "{orientation}_{biotype}", "counts_and_res.txt"), threads: 4 # to limit memory usage, actually run: counts_data = pd.read_table(input.counts_table, index_col="gene") summaries = pd.read_table(input.summary_table, index_col=0) # Running DESeq2 ################# - formula = Formula("~ lib") (cond, ref) = CONTRAST2PAIR[wildcards.contrast] if not any(counts_data[f"{ref}_{rep}"].any() for rep in REPS): warnings.warn( @@ -1203,73 +1220,89 @@ rule differential_expression: for outfile in output: shell(f"echo 'NA' > {outfile}") else: - contrast = StrVector(["lib", cond, ref]) try: - res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast) - except RRuntimeError as e: - warnings.warn( - "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % ( - str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype)) - for outfile in output: - shell(f"echo 'NA' > {outfile}") - else: - # Determining fold-change category - ################################### - set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange") - #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") - res = res.assign(status=res.apply(set_de_status, axis=1)) - # Converting gene IDs - ###################### - with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: - res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) - with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: - res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) - # Just to see if column_converter works also with named column, and not just index: - # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file: - # res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1)) - ########################################## - # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",") - res.to_csv(output.deseq_results, sep="\t", na_rep="NA") - # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status - counts_and_res = counts_data - for normalizer in SIZE_FACTORS: - if normalizer == "median_ratio_to_pseudo_ref": - ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but - ## add pseudo-count to compute the geometric mean, then remove it - #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1 - #def median_ratio_to_pseudo_ref(col): - # return (col / pseudo_ref).median() - #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0) - size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) - else: - #raise NotImplementedError(f"{normalizer} normalization not implemented") - size_factors = summaries.loc[normalizer] - by_norm = counts_data / size_factors - by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer)) - counts_and_res = pd.concat((counts_and_res, by_norm), axis=1) - #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") - counts_and_res = pd.concat((counts_and_res, res), axis=1) - counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA") - # Saving lists of genes gaining or loosing siRNAs - up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index) - down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index) - with open(output.up_genes, "w") as up_file: - if up_genes: - up_file.write("%s\n" % "\n".join(up_genes)) - else: - up_file.truncate(0) - with open(output.down_genes, "w") as down_file: - if down_genes: - down_file.write("%s\n" % "\n".join(down_genes)) - else: - down_file.truncate(0) + try: + contrast = StrVector(["lib", cond, ref]) + formula = Formula("~ lib") + res, size_factors = do_deseq2(COND_NAMES, CONDITIONS, counts_data, formula=formula, contrast=contrast) + except RRuntimeError as e: + warnings.warn( + "Probably not enough usable data points to perform DESeq2 analyses:\n%s\nSkipping %s_%s_%s" % ( + str(e), wildcards.contrast, wildcards.orientation, wildcards.biotype)) + for outfile in output: + shell(f"echo 'NA' > {outfile}") + else: + # Determining fold-change category + ################################### + set_de_status = status_setter(LFC_CUTOFFS, "log2FoldChange") + #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") + res = res.assign(status=res.apply(set_de_status, axis=1)) + # Converting gene IDs + ###################### + with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file: + res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1)) + with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file: + res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1)) + # Just to see if column_converter works also with named column, and not just index: + # with open(OPJ(convert_dir, "cosmid2name.pickle"), "rb") as dict_file: + # res = res.assign(name=res.apply(column_converter(load(dict_file), "cosmid"), axis=1)) + ########################################## + # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",") + res.to_csv(output.deseq_results, sep="\t", na_rep="NA") + # Joining counts and DESeq2 results in a same table and determining up- or down- regulation status + counts_and_res = counts_data + for normalizer in SIZE_FACTORS: + if normalizer == "median_ratio_to_pseudo_ref": + ## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but + ## add pseudo-count to compute the geometric mean, then remove it + #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1 + #def median_ratio_to_pseudo_ref(col): + # return (col / pseudo_ref).median() + #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0) + size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) + else: + #raise NotImplementedError(f"{normalizer} normalization not implemented") + size_factors = summaries.loc[normalizer] + by_norm = counts_data / size_factors + by_norm.columns = by_norm.columns.map(lambda s: "%s_by_%s" % (s, normalizer)) + counts_and_res = pd.concat((counts_and_res, by_norm), axis=1) + #counts_and_res = add_tags_column(pd.concat((counts_and_res, res), axis=1).assign(status=res.apply(set_de_status, axis=1)), input.tags_table, "small_type") + counts_and_res = pd.concat((counts_and_res, res), axis=1) + counts_and_res.to_csv(output.counts_and_res, sep="\t", na_rep="NA") + # Saving lists of genes gaining or loosing siRNAs + up_genes = list(counts_and_res.query(f"status in {UP_STATUSES}").index) + down_genes = list(counts_and_res.query(f"status in {DOWN_STATUSES}").index) + with open(output.up_genes, "w") as up_file: + if up_genes: + up_file.write("%s\n" % "\n".join(up_genes)) + else: + up_file.truncate(0) + with open(output.down_genes, "w") as down_file: + if down_genes: + down_file.write("%s\n" % "\n".join(down_genes)) + else: + down_file.truncate(0) + # Does not seem to be caught... + except KeyError as err: + err_msg = str(err) + warnings.warn("XXXXXXXXXXXXXXXXX Got KeyError XXXXXXXXXXXXXXXXX") + if err_msg[:17] == "Trying to release": + warnings.warn(err_msg) + warnings.warn(f"Skipping {wildcards.contrast}_{wildcards.orientation}_{wildcards.biotype}\n") + for outfile in output: + shell(f"echo 'NA' > {outfile}") + else: + raise + except: + warnings.warn("XXXXXXXXXXXXXXXXX Got another exception XXXXXXXXXXXXXXXXX") + raise rule make_lfc_distrib_plot: input: deseq_results = rules.differential_expression.output.deseq_results, output: - lfc_plot = OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{contrast}", "{orientation}_{biotype}", "{fold_type}_distribution.pdf"), + lfc_plot = OPJ("{trimmer}", "figures", aligner, "{counter}", "{contrast}", "{orientation}_{biotype}", "{fold_type}_distribution.pdf"), run: if test_na_file(input.deseq_results): warnings.warn( @@ -1297,7 +1330,7 @@ rule make_MA_plot: input: deseq_results = rules.differential_expression.output.deseq_results, output: - MA_plot = OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{contrast}", "{orientation}_{biotype}", "MA_with_{id_list}.pdf"), + MA_plot = OPJ("{trimmer}", "figures", aligner, "{counter}", "{contrast}", "{orientation}_{biotype}", "MA_with_{id_list}.pdf"), params: lfc_range = set_lfc_range, id_list = get_id_list, @@ -1667,7 +1700,7 @@ rule plot_meta_profile_mean: bigwig = rules.merge_bigwig_reps.output.bw, bed = rules.select_genes_for_meta_profile.output.out_bed, output: - OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_by_{norm_type}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), + OPJ("{trimmer}", "figures", aligner, "{lib}_by_{norm_type}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), params: meta_params = meta_params, # before = META_MARGIN, diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile index 472ed38e4cc6216e63de083d85ae6231f63e4a5b..9d94427fda04677dac8842934321a21a73155f97 100644 --- a/RNA_Seq_Cecere/RNA-seq.snakefile +++ b/RNA_Seq_Cecere/RNA-seq.snakefile @@ -250,10 +250,12 @@ avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) # bedtools intersect eats all the RAM when there are a lot of rRNAs: # https://github.com/arq5x/bedtools2/issues/565 COUNTERS = ["feature_count"] -output_dir = config["output_dir"] -transcriptomes_dir = OPJ(output_dir, "merged_transcriptomes") -log_dir = OPJ(output_dir, f"logs_{genome}") -data_dir = OPJ(output_dir, "data") +#output_dir = config["output_dir"] +#workdir: config["output_dir"] +output_dir = os.path.abspath(".") +transcriptomes_dir = OPJ("merged_transcriptomes") +log_dir = OPJ(f"logs_{genome}") +data_dir = OPJ("data") #NORM_TYPES = config["norm_types"] if spikein_microliter_equivalent: @@ -287,7 +289,7 @@ def specific_input(aligner): files = [] # files = [ # expand( - # OPJ(output_dir, aligner, f"mapped_{genome}", "ballgown", + # OPJ(aligner, f"mapped_{genome}", "ballgown", # "{lib}_{rep}_on_%s.gtf" % genome), # lib=LIBS, rep=REPS), # OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "compare.stats"), @@ -302,65 +304,65 @@ def specific_input(aligner): counts_files = [ expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "{lib}_mean_{mapped_type}", "{lib}_mean_{biotype}_{orientation}_TPM.txt"), counter=COUNTERS, lib=LIBS, mapped_type=[f"on_{genome}"], biotype=["alltypes"], orientation=ORIENTATIONS), expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_TPM.txt"), counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"], biotype=["alltypes"], orientation=ORIENTATIONS), expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_RPK.txt"), counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"], biotype=BIOTYPES + JOINED_BIOTYPES, orientation=ORIENTATIONS), expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_counts.txt"), counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"], biotype=BIOTYPES + JOINED_BIOTYPES, orientation=ORIENTATIONS), # expand( - # OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + # OPJ(aligner, f"mapped_{genome}", "{counter}", # "summaries", f"{{lib}}_{{rep}}_on_{genome}_{{orientation}}_counts.txt"), # counter=COUNTERS, lib=LIBS, rep=REPS, orientation=ORIENTATIONS), # expand( - # OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + # OPJ(aligner, f"mapped_{genome}", "{counter}", # f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts.txt"), # counter=COUNTERS, lib=LIBS, rep=REPS, biotype=BIOTYPES, orientation=ORIENTATIONS), # expand( - # OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + # OPJ(aligner, f"mapped_{genome}", "{counter}", # f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts_gene_names.txt"), # counter=COUNTERS, lib=LIBS, rep=REPS, biotype=BIOTYPES, orientation=ORIENTATIONS),] # Just a test: # expand( - # OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + # OPJ(aligner, f"mapped_{genome}", "{counter}", # "summaries", "{lib}_{rep}_{mapped_type}_{orientation}_counts.txt"), # counter=COUNTERS, lib=LIBS, rep=REPS, # mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], orientation=ORIENTATIONS), # expand( - # OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + # OPJ(aligner, f"mapped_{genome}", "{counter}", # "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"), # counter=COUNTERS, lib=LIBS, rep=REPS, # mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], biotype=BIOTYPES, orientation=ORIENTATIONS), # expand( - # OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + # OPJ(aligner, f"mapped_{genome}", "{counter}", # "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"), # counter=COUNTERS, lib=LIBS, rep=REPS, # mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], biotype=BIOTYPES, orientation=ORIENTATIONS),] expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "summaries", "{lib}_{rep}_{mapped_type}_{orientation}_counts.txt"), counter=COUNTERS, lib=LIBS, rep=REPS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"], orientation=ORIENTATIONS), expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"), counter=COUNTERS, lib=LIBS, rep=REPS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"], biotype=BIOTYPES, orientation=ORIENTATIONS), expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"), counter=COUNTERS, lib=LIBS, rep=REPS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"], biotype=BIOTYPES, orientation=ORIENTATIONS),] @@ -370,7 +372,7 @@ counts_files = [ if contrasts_dict["ip"]: ip_fold_boxplots = [ expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}", "{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}_boxplots.pdf"), #counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"], counter=COUNTERS, mapped_type=[f"on_{genome}"], @@ -383,7 +385,7 @@ else: if contrasts_dict["de"]: de_fold_boxplots = [ expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}", "{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}_boxplots.pdf"), counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"], contrast_type=["de"], @@ -394,18 +396,18 @@ else: de_files = [ expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_counts_and_res.txt"), counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"], contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES), expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_{fold_type}_distribution.pdf"), counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"], contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES, fold_type=["log2FoldChange"]), expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_MA_with_{id_list}.pdf"), counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"], contrast=DE_CONTRASTS, orientation=ORIENTATIONS, biotype=DE_BIOTYPES, @@ -414,23 +416,23 @@ de_files = [ bigwig_files = [ # expand( - # OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", + # OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", # "{lib}_{rep}_{orientation}_{mapped_type}_by_{norm_type}.bw"), # lib=LIBS, rep=REPS, orientation=ORIENTATIONS, # mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], norm_type=NORM_TYPES), # expand( - # OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", + # OPJ(aligner, f"mapped_{genome}", "{lib}_mean", # "{lib}_mean_{orientation}_{mapped_type}_by_{norm_type}.bw"), # lib=LIBS, orientation=ORIENTATIONS, # mapped_type=[f"on_{genome}", f"polyU_on_{genome}"], norm_type=NORM_TYPES),] expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", + OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "{lib}_{rep}_{orientation}_{mapped_type}_by_{norm_type}.bw"), lib=LIBS, rep=REPS, orientation=ORIENTATIONS, #mapped_type=[f"on_{genome}", f"unique_on_{genome}"], norm_type=NORM_TYPES), mapped_type=[f"on_{genome}"], norm_type=NORM_TYPES), expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", + OPJ(aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{orientation}_{mapped_type}_by_{norm_type}.bw"), lib=LIBS, orientation=ORIENTATIONS, #mapped_type=[f"on_{genome}", f"unique_on_{genome}"], norm_type=NORM_TYPES),] @@ -438,7 +440,7 @@ bigwig_files = [ if spikein_microliter_equivalent: spikein_files = [ - expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + expand(OPJ(aligner, f"mapped_{genome}", "{counter}", f"all_on_{genome}", "{lib}_{rep}_spike_ins_{orientation}_TPM_response.pdf"), counter=COUNTERS, orientation=ORIENTATIONS, lib=LIBS, rep=REPS),] else: @@ -449,26 +451,26 @@ rule all: input: specific_input(aligner), expand( - OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"), + OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"), lib=LIBS, rep=REPS), expand( - OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_coverage.txt"), + OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_coverage.txt"), lib=LIBS, rep=REPS), bigwig_files, spikein_files, # Uses too much memory when there are many libraries - # expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + # expand(OPJ(aligner, f"mapped_{genome}", "{counter}", # f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_{quantif_type}_scatter_pairs.pdf"), # counter=COUNTERS, biotype=["alltypes"], orientation=ORIENTATIONS, quantif_type=["TPM"]), - # expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + # expand(OPJ(aligner, f"mapped_{genome}", "{counter}", # f"all_on_{genome}", "scatter_pairs/{biotype}_{orientation}_{quantif_type}_scatter_pairs.pdf"), # counter=COUNTERS, biotype=PAIR_SCATTER_BIOTYPES, orientation=ORIENTATIONS, quantif_type=["counts", "RPK"]), expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "summaries", "all_{mapped_type}_{orientation}_{biotype}_norm_correlations.pdf"), counter=COUNTERS, mapped_type=[f"on_{genome}"], orientation=ORIENTATIONS, biotype=BIOTYPES + ["alltypes"]), expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "summaries", "all_{mapped_type}_{orientation}_{biotype}_norm_counts_distrib.pdf"), counter=COUNTERS, mapped_type=[f"on_{genome}"], orientation=ORIENTATIONS, biotype=BIOTYPES + ["alltypes"]), de_files, @@ -483,14 +485,14 @@ rule all: # results_44hph_vs_38hph/hisat2/mapped_C_elegans/{counter}/all_{mapped_type}/{biotype}_{orientation}_counts.txt # see https://bitbucket.org/snakemake/snakemake/issues/956/placeholders-not-properly-matched-with expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "replicates_comparison", "{lib}", "{biotype}_{orientation}_{quantif_type}_correlations.txt"), counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"], lib=LIBS, biotype=["alltypes"], orientation=ORIENTATIONS, quantif_type=["TPM"]), expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "replicates_comparison", "{lib}", "{biotype}_{orientation}_{quantif_type}_correlations.txt"), counter=COUNTERS, mapped_type=[f"on_{genome}", f"unique_on_{genome}"], @@ -534,9 +536,9 @@ rule map_on_genome: fastq = OPJ(data_dir, "{lib}_{rep}.fastq.gz"), output: # temp because it uses a lot of space - sam = temp(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s.sam" % genome)), - #sam = temp(OPJ(output_dir, aligner, "mapped_%s" % genome, "{lib}_{rep}_on_%s.sam" % genome)), - nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s.fastq.gz" % genome), + sam = temp(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s.sam" % genome)), + #sam = temp(OPJ(aligner, "mapped_%s" % genome, "{lib}_{rep}_on_%s.sam" % genome)), + nomap_fastq = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s.fastq.gz" % genome), params: aligner = aligner, index = genome_db, @@ -561,10 +563,10 @@ rule extract_nomap_polyU: Extract from the non-mappers those that end with a polyU tail of length from {minU} to {maxU}.""" input: - nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s.fastq.gz" % genome), + nomap_fastq = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s.fastq.gz" % genome), output: - nomap_polyU = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s_polyU.fastq.gz" % genome), - nomap_polyU_stats = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s_polyU_stats.txt" % genome), + nomap_polyU = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s_polyU.fastq.gz" % genome), + nomap_polyU_stats = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s_polyU_stats.txt" % genome), run: stats = defaultdict(int) with gzopen(output.nomap_polyU, "wb") as fq_out: @@ -588,8 +590,8 @@ rule remap_on_genome: fastq = rules.extract_nomap_polyU.output.nomap_polyU, output: # temp because it might use a lot of space - sam = temp(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_polyU_on_%s.sam" % genome)), - nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_polyU_unmapped_on_%s.fastq.gz" % genome), + sam = temp(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_polyU_on_%s.sam" % genome)), + nomap_fastq = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_polyU_unmapped_on_%s.fastq.gz" % genome), params: aligner = aligner, index = genome_db, @@ -615,7 +617,7 @@ def source_sam(wildcards): ## debug # Why rules.map_on_genome.output.sam doesn't get properly expanded? #return rules.map_on_genome.output.sam - return OPJ(output_dir, aligner, f"mapped_{genome}", f"{wildcards.lib}_{wildcards.rep}_on_%s.sam" % genome) + return OPJ(aligner, f"mapped_{genome}", f"{wildcards.lib}_{wildcards.rep}_on_%s.sam" % genome) ## elif mapped_type == f"polyU_on_{genome}": return rules.remap_on_genome.output.sam @@ -628,13 +630,13 @@ def source_sam(wildcards): rule sam2indexedbam: input: #debug_wildcards, - # sam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}.sam"), + # sam = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}.sam"), sam = source_sam, output: - # sorted_bam = protected(OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam")), - # index = protected(OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam.bai")), - sorted_bam = protected(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_sorted.bam")), - index = protected(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_sorted.bam.bai")), + # sorted_bam = protected(OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam")), + # index = protected(OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam.bai")), + sorted_bam = protected(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_sorted.bam")), + index = protected(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_sorted.bam.bai")), message: "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}_{wildcards.mapped_type}." log: @@ -651,9 +653,9 @@ rule sam2indexedbam: rule compute_mapping_stats: input: sorted_bam = rules.sam2indexedbam.output.sorted_bam, - #sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"), + #sorted_bam = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"), output: - stats = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_samtools_stats.txt"), + stats = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_samtools_stats.txt"), shell: """samtools stats {input.sorted_bam} > {output.stats}""" @@ -662,9 +664,9 @@ rule get_nb_mapped: """Extracts the number of mapped reads from samtools stats results.""" input: stats = rules.compute_mapping_stats.output.stats, - #stats = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"), + #stats = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_samtools_stats.txt"), output: - nb_mapped = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_nb_mapped.txt"), + nb_mapped = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_nb_mapped.txt"), shell: """samstats2mapped {input.stats} {output.nb_mapped}""" @@ -672,9 +674,9 @@ rule get_nb_mapped: rule compute_coverage: input: sorted_bam = rules.sam2indexedbam.output.sorted_bam, - #sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"), + #sorted_bam = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"), output: - coverage = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_coverage.txt"), + coverage = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}_{mapped_type}_coverage.txt"), params: genomelen = genomelen, shell: @@ -686,9 +688,9 @@ rule compute_coverage: rule assemble_transcripts: input: - sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"), + sorted_bam = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"), output: - gtf = OPJ(output_dir, aligner, f"mapped_{genome}", "stringtie", f"{{lib}}_{{rep}}_on_{genome}.gtf"), + gtf = OPJ(aligner, f"mapped_{genome}", "stringtie", f"{{lib}}_{{rep}}_on_{genome}.gtf"), params: annot = OPJ(annot_dir, "genes.gtf"), message: @@ -710,7 +712,7 @@ rule assemble_transcripts: rule merge_transcripts: input: - expand(OPJ(output_dir, aligner, f"mapped_{genome}", "stringtie", f"{{lib}}_{{rep}}_on_{genome}.gtf"), lib=LIBS, rep=REPS), + expand(OPJ(aligner, f"mapped_{genome}", "stringtie", f"{{lib}}_{{rep}}_on_{genome}.gtf"), lib=LIBS, rep=REPS), output: merged = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"), params: @@ -756,9 +758,9 @@ rule gffcompare: rule quantify_transcripts: input: merged = OPJ(transcriptomes_dir, aligner, f"mapped_{genome}", "stringtie", "merged.gtf"), - sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"), + sorted_bam = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"), output: - gtf = OPJ(output_dir, aligner, f"mapped_{genome}", "ballgown", f"{{lib}}_{{rep}}_on_{genome}.gtf"), + gtf = OPJ(aligner, f"mapped_{genome}", "ballgown", f"{{lib}}_{{rep}}_on_{genome}.gtf"), message: "Quantifying transcripts for {wildcards.lib}_{wildcards.rep} with stringtie." log: @@ -807,11 +809,11 @@ def biotype2annot(wildcards): # rule htseq_count_reads: # input: -# sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"), -# bai = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam.bai"), +# sorted_bam = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"), +# bai = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam.bai"), # output: -# counts = OPJ(output_dir, aligner, f"mapped_{genome}", "htseq_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts.txt"), -# counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "htseq_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts_gene_names.txt"), +# counts = OPJ(aligner, f"mapped_{genome}", "htseq_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts.txt"), +# counts_converted = OPJ(aligner, f"mapped_{genome}", "htseq_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts_gene_names.txt"), # params: # stranded = htseq_orientation2stranded, # mode = "union", @@ -834,7 +836,7 @@ def source_sorted_bam(wildcards): else: real_mapped_type = mapped_type return OPJ( - output_dir, aligner, f"mapped_{genome}", + aligner, f"mapped_{genome}", f"{wildcards.lib}_{wildcards.rep}_{real_mapped_type}_sorted.bam") @@ -845,7 +847,7 @@ def source_index(wildcards): else: real_mapped_type = mapped_type return OPJ( - output_dir, aligner, f"mapped_{genome}", + aligner, f"mapped_{genome}", f"{wildcards.lib}_{wildcards.rep}_{real_mapped_type}_sorted.bam.bai") @@ -856,8 +858,8 @@ rule feature_count_reads: #sorted_bam = rules.sam2indexedbam.output.sorted_bam, #index = rules.sam2indexedbam.output.index, output: - counts = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"), - counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"), + counts = OPJ(aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"), + counts_converted = OPJ(aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"), #wildcard_constraints: # mapped_type = "|".join([f"on_{genome}", f"polyU_on_{genome}"]) params: @@ -892,8 +894,8 @@ rule feature_count_reads: # #sorted_bam = rules.sam2indexedbam.output.sorted_bam, # #index = rules.sam2indexedbam.output.index, # output: -# counts = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"), -# counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"), +# counts = OPJ(aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"), +# counts_converted = OPJ(aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_{mapped_type}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"), # #wildcard_constraints: # # mapped_type = f"unique_on_{genome}" # params: @@ -953,11 +955,11 @@ rule feature_count_reads: # # rule intersect_count_reads: # input: -# sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"), -# bai = OPJ(output_dir, aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam.bai"), +# sorted_bam = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam"), +# bai = OPJ(aligner, f"mapped_{genome}", f"{{lib}}_{{rep}}_on_{genome}_sorted.bam.bai"), # output: -# counts = OPJ(output_dir, aligner, f"mapped_{genome}", "intersect_count", f"{{lib}}_{{rep}}_on_{genome}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"), -# counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "intersect_count", f"{{lib}}_{{rep}}_on_{genome}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"), +# counts = OPJ(aligner, f"mapped_{genome}", "intersect_count", f"{{lib}}_{{rep}}_on_{genome}", "{lib}_{rep}_{biotype}_{orientation}_counts.txt"), +# counts_converted = OPJ(aligner, f"mapped_{genome}", "intersect_count", f"{{lib}}_{{rep}}_on_{genome}", "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"), # params: # stranded = intersect_orientation2stranded, # annot = biotype2merged, @@ -976,9 +978,9 @@ rule feature_count_reads: rule summarize_counts: """For a given library, write a summary of the read counts for the various biotypes.""" input: - biotype_counts_files = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{{counter}}", "{{lib}}_{{rep}}_{{mapped_type}}", "{{lib}}_{{rep}}_{biotype}_{{orientation}}_counts.txt"), biotype=BIOTYPES), + biotype_counts_files = expand(OPJ(aligner, f"mapped_{genome}", "{{counter}}", "{{lib}}_{{rep}}_{{mapped_type}}", "{{lib}}_{{rep}}_{biotype}_{{orientation}}_counts.txt"), biotype=BIOTYPES), output: - summary = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "summaries", "{lib}_{rep}_{mapped_type}_{orientation}_counts.txt") + summary = OPJ(aligner, f"mapped_{genome}", "{counter}", "summaries", "{lib}_{rep}_{mapped_type}_{orientation}_counts.txt") run: if wildcards.counter == "htseq_count": sum_counter = sum_htseq_counts @@ -999,16 +1001,16 @@ rule gather_read_counts_summaries: """Gather read count summaries across libraries: lib_rep -> all.""" input: summary_tables = expand(OPJ( - output_dir, aligner, f"mapped_{genome}", "{{counter}}", "summaries", + aligner, f"mapped_{genome}", "{{counter}}", "summaries", "{name}_{{mapped_type}}_{{orientation}}_counts.txt"), name=COND_NAMES), output: summary_table = OPJ( - output_dir, aligner, f"mapped_{genome}", "{counter}", "summaries", + aligner, f"mapped_{genome}", "{counter}", "summaries", "all_{mapped_type}_{orientation}_counts.txt"), run: #summary_files = (OPJ( summary_files = [OPJ( - output_dir, aligner, f"mapped_{genome}", wildcards.counter, "summaries", + aligner, f"mapped_{genome}", wildcards.counter, "summaries", f"{cond_name}_{wildcards.mapped_type}_{wildcards.orientation}_counts.txt") for cond_name in COND_NAMES] #orientation=wildcards.orientation)) for cond_name in COND_NAMES) try: @@ -1024,13 +1026,13 @@ rule gather_counts: """For a given biotype, gather counts from all libraries in one table.""" input: counts_tables = expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{{counter}}", + OPJ(aligner, f"mapped_{genome}", "{{counter}}", "{cond_name}_{{mapped_type}}", "{cond_name}_{{biotype}}_{{orientation}}_counts.txt"), cond_name=COND_NAMES), output: counts_table = OPJ( - output_dir, aligner, f"mapped_{genome}", "{counter}", + aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_counts.txt"), wildcard_constraints: # Avoid ambiguity with join_all_counts @@ -1040,7 +1042,6 @@ rule gather_counts: ############################ #counts_files = (OPJ( counts_files = [OPJ( - output_dir, aligner, f"mapped_{genome}", wildcards.counter, @@ -1081,10 +1082,10 @@ rule compute_RPK: input: # TODO: Why wildcards seems to be None? #counts_data = rules.gather_counts.output.counts_table, - counts_data = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + counts_data = OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_counts.txt"), output: - rpk_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + rpk_file = OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_RPK.txt"), params: feature_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"), @@ -1102,7 +1103,7 @@ rule compute_sum_million_RPK: input: rpk_file = rules.compute_RPK.output.rpk_file, output: - sum_rpk_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + sum_rpk_file = OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_sum_million_RPK.txt"), run: sum_rpk = pd.read_table( @@ -1120,7 +1121,7 @@ rule compute_TPM: input: rpk_file = rules.compute_RPK.output.rpk_file output: - tpm_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + tpm_file = OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_TPM.txt"), # The sum must be done over all counted features wildcard_constraints: @@ -1138,7 +1139,7 @@ rule compute_mean_TPM: input: all_tmp_file = rules.compute_TPM.output.tpm_file output: - tpm_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + tpm_file = OPJ(aligner, f"mapped_{genome}", "{counter}", "{lib}_mean_{mapped_type}", "{lib}_mean_{biotype}_{orientation}_TPM.txt"), run: tpm = pd.read_table( @@ -1152,14 +1153,14 @@ rule compute_mean_TPM: rule compute_RPM: input: - counts_data = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + counts_data = OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_counts.txt"), #summary_table = rules.gather_read_counts_summaries.output.summary_table, summary_table = OPJ( - output_dir, aligner, f"mapped_{genome}", "{counter}", "summaries", + aligner, f"mapped_{genome}", "{counter}", "summaries", "all_{mapped_type}_fwd_counts.txt"), output: - rpm_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + rpm_file = OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_RPM.txt"), run: # Reading column counts from {input.counts_table} @@ -1178,22 +1179,22 @@ def source_quantif(wildcards): if wildcards.quantif_type == "counts": # return rules.gather_counts.output.counts_table return OPJ( - output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", + aligner, f"mapped_{genome}", f"{wildcards.counter}", f"all_{wildcards.mapped_type}", f"{wildcards.biotype}_{wildcards.orientation}_counts.txt") elif wildcards.quantif_type == "RPK": # return rules.compute_RPK.output.rpk_file - return OPJ(output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", + return OPJ(aligner, f"mapped_{genome}", f"{wildcards.counter}", f"all_{wildcards.mapped_type}", f"{wildcards.biotype}_{wildcards.orientation}_RPK.txt") elif wildcards.quantif_type == "TPM": # return rules.compute_TPM.output.tpm_file - return OPJ(output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", + return OPJ(aligner, f"mapped_{genome}", f"{wildcards.counter}", f"all_{wildcards.mapped_type}", f"{wildcards.biotype}_{wildcards.orientation}_TPM.txt") elif wildcards.quantif_type == "RPM": #return rules.compute_RPM.output.rpm_file - return OPJ(output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", + return OPJ(aligner, f"mapped_{genome}", f"{wildcards.counter}", f"all_{wildcards.mapped_type}", f"{wildcards.biotype}_{wildcards.orientation}_RPM.txt") else: @@ -1205,14 +1206,14 @@ rule compare_replicates: input: # AttributeError: 'NoneType' object has no attribute 'keys' # counts_data = rules.gather_counts.output.counts_table, - #counts_data = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + #counts_data = OPJ(aligner, f"mapped_{genome}", "{counter}", # f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"), data_file = source_quantif, # MissingInputException in line 1174 of /home/bli/src/bioinfo_utils/RNA-seq/RNA-seq.snakefile: # Missing input files for rule compare_replicates: # results_prg1_KO_48hph/hisat2/mapped_C_elegans/{counter}/all_{mapped_type}/{biotype}_{orientation}_TPM.txt output: - corr_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + corr_file = OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "replicates_comparison", "{lib}", "{biotype}_{orientation}_{quantif_type}_correlations.txt"), run: @@ -1266,7 +1267,7 @@ rule plot_biotype_scatter_pairs: input: data_file = source_quantif, output: - plot_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + plot_file = OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "scatter_pairs", "{biotype}_{orientation}_{quantif_type}_scatter_pairs.pdf"), run: @@ -1332,12 +1333,12 @@ def do_linear_regression(data, x_col, y_col, y_min=0, fit_intercept=True, transf rule plot_spikein_responses: """Plot log2(TPM) vs log2(spikein_expected_counts), for each library.""" input: - tpm_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + tpm_file = OPJ(aligner, f"mapped_{genome}", "{counter}", f"all_on_{genome}", "alltypes_{orientation}_TPM.txt"), output: - response_plots = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{{counter}}", + response_plots = expand(OPJ(aligner, f"mapped_{genome}", "{{counter}}", f"all_on_{genome}", "{lib}_{rep}_spike_ins_{{orientation}}_TPM_response.pdf"), lib=LIBS, rep=REPS), - # spikein_slope_files = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", f"all_on_{genome}", "spike_ins_{orientation}_TPM_response_slope.txt"), + # spikein_slope_files = OPJ(aligner, f"mapped_{genome}", "{counter}", f"all_on_{genome}", "spike_ins_{orientation}_TPM_response_slope.txt"), run: tpm_table = pd.read_table(input.tpm_file, index_col="gene") common = tpm_table.index.intersection(spikein_expected_counts.index) @@ -1354,7 +1355,7 @@ rule plot_spikein_responses: rel_sq_diffs_list = [] for libname in data.columns[:-1]: filename = OPJ( - output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", + aligner, f"mapped_{genome}", f"{wildcards.counter}", f"all_on_{genome}", f"{libname}_spike_ins_{wildcards.orientation}_TPM_response.pdf") save_plot( @@ -1392,7 +1393,7 @@ rule compute_median_ratio_to_pseudo_ref_size_factors: input: counts_table = rules.gather_counts.output.counts_table, output: - median_ratios_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_median_ratios_to_pseudo_ref.txt"), + median_ratios_file = OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_median_ratios_to_pseudo_ref.txt"), run: counts_data = pd.read_table( input.counts_table, @@ -1409,12 +1410,12 @@ rule compute_median_ratio_to_pseudo_ref_size_factors: def source_normalizer(wildcards): if wildcards.norm_type == "median_ratio_to_pseudo_ref": return OPJ( - output_dir, aligner, f"mapped_{genome}", COUNTERS[0], + aligner, f"mapped_{genome}", COUNTERS[0], f"all_{wildcards.mapped_type}", "protein_coding_fwd_median_ratios_to_pseudo_ref.txt") elif wildcards.norm_type in BIOTYPES: return OPJ( - output_dir, aligner, f"mapped_{genome}", COUNTERS[0], + aligner, f"mapped_{genome}", COUNTERS[0], f"all_{wildcards.mapped_type}", f"{wildcards.norm_type}_fwd_sum_million_RPK.txt") else: @@ -1427,10 +1428,10 @@ rule make_normalized_bigwig: bam = rules.sam2indexedbam.output.sorted_bam, # Take the column out of summaries, and write it in similar format than deseq-style values size_factor_file = source_normalizer, - #size_factor_file = OPJ(output_dir, aligner, f"mapped_{genome}", COUNTERS[0], f"all_on_{genome}", "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"), - #summary_table = OPJ(output_dir, aligner, f"mapped_{genome}", COUNTERS[0], "summaries", f"all_on_{genome}_fwd_counts.txt"), + #size_factor_file = OPJ(aligner, f"mapped_{genome}", COUNTERS[0], f"all_on_{genome}", "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"), + #summary_table = OPJ(aligner, f"mapped_{genome}", COUNTERS[0], "summaries", f"all_on_{genome}_fwd_counts.txt"), output: - bigwig_norm = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{lib}_{rep}_{orientation}_{mapped_type}_by_{norm_type}.bw"), + bigwig_norm = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "{lib}_{rep}_{orientation}_{mapped_type}_by_{norm_type}.bw"), params: genome_binned = genome_binned, # orient_filter = genomecov_filter, @@ -1452,9 +1453,9 @@ rule make_normalized_bigwig: rule merge_bigwig_reps: """This rule merges bigwig files by computing a mean across replicates.""" input: - expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{{lib}}_{rep}", "{{lib}}_{rep}_{{orientation}}_{{mapped_type}}_by_{{norm_type}}.bw"), rep=REPS), + expand(OPJ(aligner, f"mapped_{genome}", "{{lib}}_{rep}", "{{lib}}_{rep}_{{orientation}}_{{mapped_type}}_by_{{norm_type}}.bw"), rep=REPS), output: - bw = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{orientation}_{mapped_type}_by_{norm_type}.bw"), + bw = OPJ(aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{orientation}_{mapped_type}_by_{norm_type}.bw"), log: warnings = OPJ(log_dir, "merge_bigwig_reps", "{lib}_mean_{orientation}_{mapped_type}_by_{norm_type}.warnings"), threads: 12 # to limit memory usage, actually @@ -1501,7 +1502,7 @@ def source_counts_to_join(wildcards): Determines which elementary biotype counts files should be joined to make the desired "joined" biotype. """ return expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{{counter}}", "all_{{mapped_type}}", "{biotype}_{{orientation}}_counts.txt"), + OPJ(aligner, f"mapped_{genome}", "{{counter}}", "all_{{mapped_type}}", "{biotype}_{{orientation}}_counts.txt"), biotype=BIOTYPES_TO_JOIN[wildcards.biotype]) @@ -1510,7 +1511,7 @@ rule join_all_counts: input: counts_tables = source_counts_to_join, output: - counts_table = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_counts.txt"), + counts_table = OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_counts.txt"), wildcard_constraints: biotype = "|".join(JOINED_BIOTYPES) run: @@ -1526,7 +1527,7 @@ def source_counts(wildcards): if wildcards.biotype in JOINED_BIOTYPES: ## debug return rules.join_all_counts.output.counts_table - #return OPJ(output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", f"all_{wildcards.mapped_type}", f"alltypes_{wildcards.orientation}_counts.txt") + #return OPJ(aligner, f"mapped_{genome}", f"{wildcards.counter}", f"all_{wildcards.mapped_type}", f"alltypes_{wildcards.orientation}_counts.txt") ## else: # "Directly" from the counts gathered across libraries @@ -1537,12 +1538,12 @@ def source_counts(wildcards): # print(rules.gather_counts.output.counts_table) # print(wildcards) # print(OPJ( - # output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", + # aligner, f"mapped_{genome}", f"{wildcards.counter}", # f"all_{wildcards.mapped_type}", f"{wildcards.biotype}_{wildcards.orientation}_counts.txt")) # sys.exit(1) return rules.gather_counts.output.counts_table #return OPJ( - # output_dir, aligner, f"mapped_{genome}", f"{wildcards.counter}", + # aligner, f"mapped_{genome}", f"{wildcards.counter}", # f"all_{wildcards.mapped_type}", f"{wildcards.biotype}_{wildcards.orientation}_counts.txt") ## @@ -1552,8 +1553,8 @@ rule test_size_factor: counts_table = source_counts, summary_table = rules.gather_read_counts_summaries.output.summary_table, output: - norm_correlations = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "summaries", "all_{mapped_type}_{orientation}_{biotype}_norm_correlations.pdf"), - norm_counts_distrib_plot = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "summaries", "all_{mapped_type}_{orientation}_{biotype}_norm_counts_distrib.pdf"), + norm_correlations = OPJ(aligner, f"mapped_{genome}", "{counter}", "summaries", "all_{mapped_type}_{orientation}_{biotype}_norm_correlations.pdf"), + norm_counts_distrib_plot = OPJ(aligner, f"mapped_{genome}", "{counter}", "summaries", "all_{mapped_type}_{orientation}_{biotype}_norm_counts_distrib.pdf"), params: size_factor_types = SIZE_FACTORS, norm_corr_plot_title = lambda wildcards : f"Correlation across samples between normalized {wildcards.orientation}_{wildcards.biotype}_{wildcards.mapped_type} counts and size factors.", @@ -1702,12 +1703,12 @@ rule test_size_factor: rule compute_RPM_folds: input: - rpm_file = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", + rpm_file = OPJ(aligner, f"mapped_{genome}", "{counter}", "all_{mapped_type}", "{biotype}_{orientation}_RPM.txt"), summary_table = rules.gather_read_counts_summaries.output.summary_table, output: fold_results = OPJ( - output_dir, aligner, f"mapped_{genome}", "{counter}", "RPM_folds_{mapped_type}", + aligner, f"mapped_{genome}", "{counter}", "RPM_folds_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_RPM_folds.txt"), run: (cond, ref) = CONTRAST2PAIR[wildcards.contrast] @@ -1739,16 +1740,16 @@ rule differential_expression: summary_table = rules.gather_read_counts_summaries.output.summary_table, output: deseq_results = OPJ( - output_dir, aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", + aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_deseq2.txt"), up_genes = OPJ( - output_dir, aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", + aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_up_genes.txt"), down_genes = OPJ( - output_dir, aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", + aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_down_genes.txt"), counts_and_res = OPJ( - output_dir, aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", + aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_counts_and_res.txt"), threads: 4 # to limit memory usage, actually run: @@ -1836,7 +1837,7 @@ rule make_lfc_distrib_plot: input: deseq_results = rules.differential_expression.output.deseq_results, output: - lfc_plot = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_{fold_type}_distribution.pdf"), + lfc_plot = OPJ(aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_{fold_type}_distribution.pdf"), run: if test_na_file(input.deseq_results): warnings.warn( @@ -1867,7 +1868,7 @@ rule make_MA_plot: input: deseq_results = rules.differential_expression.output.deseq_results, output: - MA_plot = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_MA_with_{id_list}.pdf"), + MA_plot = OPJ(aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}", "{contrast}_MA_with_{id_list}.pdf"), params: lfc_range = set_lfc_range, id_list = get_id_list, @@ -1905,7 +1906,7 @@ def source_fold_data(wildcards): if fold_type in {"log2FoldChange", "lfcMLE"}: if hasattr(wildcards, "contrast_type"): return [filename_template.format(**wildcards) for filename_template in expand( - OPJ(output_dir, aligner, f"mapped_{genome}", + OPJ(aligner, f"mapped_{genome}", "{{counter}}", "deseq2_{{mapped_type}}", "{contrast}", "{{orientation}}_{{biotype}}", "{contrast}_counts_and_res.txt"), contrast=contrasts_dict[wildcards.contrast_type])] @@ -1914,7 +1915,7 @@ def source_fold_data(wildcards): elif fold_type == "mean_log2_RPM_fold": if hasattr(wildcards, "contrast_type"): return [filename_template.format(**wildcards) for filename_template in expand( - OPJ(output_dir, aligner, f"mapped_{genome}", + OPJ(aligner, f"mapped_{genome}", "{{counter}}", "RPM_folds_{{mapped_type}}", "{contrast}", "{{orientation}}_{{biotype}}", "{contrast}_RPM_folds.txt"), contrast=contrasts_dict[wildcards.contrast_type])] @@ -1936,7 +1937,7 @@ rule make_contrast_lfc_boxplots: input: data = source_fold_data, output: - boxplots = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}", + boxplots = OPJ(aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}", "{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}_boxplots.pdf"), log: warnings = OPJ(log_dir, "make_contrast_lfc_boxplots", aligner, f"mapped_{genome}", "{counter}", "fold_boxplots_{mapped_type}", "{contrast_type}_{orientation}_{biotype}_{fold_type}_{id_list}.warnings"), @@ -2004,10 +2005,10 @@ rule make_contrast_lfc_boxplots: # up_genes_in = rules.differential_expression.output.up_genes, # down_genes_in = rules.differential_expression.output.down_genes, # output: -# deseq_results_out = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}_deseq2_{id_type}.txt"), -# counts_and_res_out = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}_counts_and_res_{id_type}.txt"), -# up_genes_out = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}_up_genes_{id_type}.txt"), -# down_genes_out = OPJ(output_dir, aligner, f"mapped_{genome}", "{counter}', "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}_down_genes_{id_type}.txt"), +# deseq_results_out = OPJ(aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}_deseq2_{id_type}.txt"), +# counts_and_res_out = OPJ(aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}_counts_and_res_{id_type}.txt"), +# up_genes_out = OPJ(aligner, f"mapped_{genome}", "{counter}", "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}_up_genes_{id_type}.txt"), +# down_genes_out = OPJ(aligner, f"mapped_{genome}", "{counter}', "deseq2_{mapped_type}", "{contrast}", "{orientation}_{biotype}_down_genes_{id_type}.txt"), # run: # with open(OPJ(convert_dir, "wormid2{id_type}.pickle".format(id_type=wildcards.id_type)), "rb") as dict_file: # converter = load(dict_file) diff --git a/Ribo-seq/Ribo-seq.snakefile b/Ribo-seq/Ribo-seq.snakefile index 056ec1fa30119505db25ce733f1932f766a4ec2e..7701702344d5b6c12669a9e1eeb5c19a4c807a47 100644 --- a/Ribo-seq/Ribo-seq.snakefile +++ b/Ribo-seq/Ribo-seq.snakefile @@ -287,13 +287,15 @@ gene_lists_dir = genome_dict["gene_lists_dir"] avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) index = genome_db -output_dir = config["output_dir"] -local_annot_dir = config.get("local_annot_dir", OPJ(output_dir, "annotations")) -log_dir = config.get("log_dir", OPJ(output_dir, "logs")) -data_dir = config.get("data_dir", OPJ(output_dir, "data")) +#output_dir = config["output_dir"] +#workdir: config["output_dir"] +output_dir = os.path.abspath(".") +local_annot_dir = config.get("local_annot_dir", OPJ("annotations")) +log_dir = config.get("log_dir", OPJ("logs")) +data_dir = config.get("data_dir", OPJ("data")) counter = "feature_count" -counts_dir = OPJ(output_dir, aligner, f"mapped_{genome}", counter) +counts_dir = OPJ(aligner, f"mapped_{genome}", counter) # Used to skip some genotype x treatment x replicate number combinations # when some of them were not sequenced forbidden = {frozenset(wc_comb.items()) for wc_comb in config["missing"]} @@ -399,12 +401,12 @@ shell.prefix(SHELL_FUNCTIONS) bigwig_files = [ # individual libraries expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", + OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_MAPPING, norm=SIZE_FACTORS, orientation=["all"]), # means of replicates expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", + OPJ(aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), lib=LIBS, read_type=READ_TYPES_FOR_MAPPING, norm=SIZE_FACTORS, orientation=["all"]), ] @@ -446,28 +448,28 @@ meta_profiles = [ #expand(OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}_{biotype}_min_{min_len}.bed"), type_set=["all", "protein_coding", "protein_coding_TE"], min_dist="0 5 10 25 50 100 250 500 1000 2500 5000 10000".split(), biotype=["protein_coding"], min_len=[str(META_MIN_LEN)]), # TODO: check how to adapt to dt_series instead of ip_series expand( - OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}", + OPJ("figures", "mean_meta_profiles_meta_scale_{meta_scale}", "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{series_type}_{series}_meta_profile.pdf"), meta_scale= [str(META_SCALE)], read_type=[size_selected, "RPF"], norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=[str(MIN_DIST)], biotype=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"], min_len=[str(META_MIN_LEN)], series_type=["dt_series"], series=list(dt_series.keys())), expand( - OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}", + OPJ("figures", "mean_meta_profiles_meta_scale_{meta_scale}", "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{series_type}_{series}_meta_profile.pdf"), meta_scale= [str(META_SCALE)], read_type=[size_selected, "RPF"], norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding"], min_dist=[str(MIN_DIST)], biotype=["protein_coding"], min_len=[str(META_MIN_LEN)], series_type=["dt_series"], series=list(dt_series.keys())), expand( - OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}", + OPJ("figures", "mean_meta_profiles_meta_scale_{meta_scale}", "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{id_list}_{series_type}_{series}_meta_profile.pdf"), meta_scale=[str(META_SCALE)], read_type=[size_selected, "RPF"], norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=["0"], id_list=GENE_LISTS, series_type=["dt_series"], series=list(dt_series.keys())), ## TODO: Resolve issue with bedtools # expand( - # OPJ(output_dir, "figures", "{lib}_{rep}", + # OPJ("figures", "{lib}_{rep}", # "{read_type}_by_{norm}_{orientation}_pi_targets_in_{biotype}_profile.pdf"), # lib=LIBS, rep=REPS, read_type=[size_selected, "RPF"], # norm=SIZE_FACTORS, orientation=["all"], biotype=["protein_coding"]), @@ -475,26 +477,26 @@ meta_profiles = [ read_graphs = [ expand( - OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_base_composition_from_{position}.pdf"), + OPJ("figures", "{lib}_{rep}", "{read_type}_base_composition_from_{position}.pdf"), lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_COMPOSITION, position=["start", "end"]), expand( - OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_base_logo_from_{position}.pdf"), + OPJ("figures", "{lib}_{rep}", "{read_type}_base_logo_from_{position}.pdf"), lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_COMPOSITION, position=["start", "end"]), expand( - OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_{position}_base_composition.pdf"), + OPJ("figures", "{lib}_{rep}", "{read_type}_{position}_base_composition.pdf"), lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_COMPOSITION, position=POSITIONS), expand( - OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_{position}_base_logo.pdf"), + OPJ("figures", "{lib}_{rep}", "{read_type}_{position}_base_logo.pdf"), lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_COMPOSITION, position=POSITIONS), expand( - OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_size_distribution.pdf"), + OPJ("figures", "{lib}_{rep}", "{read_type}_size_distribution.pdf"), lib=LIBS, rep=REPS, read_type=["trimmed"]), # Not relevant in Ribo-seq? #expand( - # OPJ(output_dir, "figures", "{lib}_{rep}", f"{size_selected}_smallRNA_barchart.pdf"), + # OPJ("figures", "{lib}_{rep}", f"{size_selected}_smallRNA_barchart.pdf"), # lib=LIBS, rep=REPS), expand( - OPJ(output_dir, "figures", "{lib}_{rep}", "nb_reads.pdf"), + OPJ("figures", "{lib}_{rep}", "nb_reads.pdf"), lib=LIBS, rep=REPS), ] @@ -502,16 +504,16 @@ read_graphs = [ # What should be done with small_type, fold_type, etc. if contrasts_dict["dt"]: fold_heatmaps = expand( - OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"), + OPJ("figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"), small_type=["pisimi", "prot_si"], fold_type=["mean_log2_RPM_fold", "log2FoldChange"]) ip_fold_boxplots = expand( - OPJ(output_dir, "figures", "all_{contrast_type}", + OPJ("figures", "all_{contrast_type}", "{contrast_type}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf"), contrast_type=["ip"], small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"], gene_list=BOXPLOT_GENE_LISTS) else: fold_heatmaps = expand( - OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"), + OPJ("figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"), small_type=["pisimi", "prot_si"], fold_type=["log2FoldChange"]) ip_fold_boxplots = [] @@ -519,34 +521,34 @@ exploratory_graphs = [ fold_heatmaps, # Large figures, not very readable #expand( - # OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "{contrast}", "{contrast}_{small_type}_pairplots.pdf"), + # OPJ(aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "{contrast}", "{contrast}_{small_type}_pairplots.pdf"), # contrast=DE_CONTRASTS, small_type=DE_TYPES), ## TODO: debug PCA #expand( - # OPJ(output_dir, "figures", "{small_type}_{standard}_PCA.pdf"), + # OPJ("figures", "{small_type}_{standard}_PCA.pdf"), # small_type=["pisimi"], standard=STANDARDS), #expand( - # OPJ(output_dir, "figures", "{small_type}_{standard}_PC1_PC2_distrib.pdf"), + # OPJ("figures", "{small_type}_{standard}_PC1_PC2_distrib.pdf"), # small_type=["pisimi"], standard=STANDARDS), ## #expand( - # OPJ(output_dir, "figures", "{small_type}_clustermap.pdf"), + # OPJ("figures", "{small_type}_clustermap.pdf"), # small_type=SMALL_TYPES), #expand( - # OPJ(output_dir, "figures", "{small_type}_zscore_clustermap.pdf"), + # OPJ("figures", "{small_type}_zscore_clustermap.pdf"), # small_type=SMALL_TYPES), #expand( - # OPJ(output_dir, "figures", "{contrast}", "{small_type}_zscore_clustermap.pdf"), + # OPJ("figures", "{contrast}", "{small_type}_zscore_clustermap.pdf"), # contrast=CONTRASTS, small_type=DE_TYPES), ] de_fold_boxplots = expand( - OPJ(output_dir, "figures", "{contrast}", + OPJ("figures", "{contrast}", "{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf"), contrast=DE_CONTRASTS, small_type=DE_TYPES, fold_type=["log2FoldChange", "mean_log2_RPM_fold"], gene_list=["all_gene_lists"]) ip_fold_boxplots_by_contrast = expand( - OPJ(output_dir, "figures", "{contrast}", + OPJ("figures", "{contrast}", "{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf"), contrast=DT_CONTRASTS, small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"], gene_list=["all_gene_lists"]) @@ -556,14 +558,14 @@ fold_boxplots = [de_fold_boxplots, ip_fold_boxplots_by_contrast, ip_fold_boxplot rule all: input: expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_nb_mapped.txt" % genome), + OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_nb_mapped.txt" % genome), lib=LIBS, rep=REPS, read_type=[size_selected]), expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_coverage.txt" % genome), + OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_coverage.txt" % genome), lib=LIBS, rep=REPS, read_type=[size_selected]), # In read_graphs #expand( - # OPJ(output_dir, "figures", "{lib}_{rep}", "nb_reads.pdf"), + # OPJ("figures", "{lib}_{rep}", "nb_reads.pdf"), # lib=LIBS, rep=REPS), read_graphs, counts_files, @@ -596,22 +598,22 @@ rule all: # rule future_all: # meta_profiles, # read_graphs, -# OPJ(output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "pisimi_mean_log2_RPM_fold.txt"), -# expand(OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]), -# #expand(OPJ(output_dir, aligner, f"mapped_{genome}", "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=DT_CONTRASTS, small_type=DE_TYPES), +# OPJ(aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "pisimi_mean_log2_RPM_fold.txt"), +# expand(OPJ(aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]), +# #expand(OPJ(aligner, f"mapped_{genome}", "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=DT_CONTRASTS, small_type=DE_TYPES), # exploratory_graphs, # fold_boxplots, -# #expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.pdf"), small_type=SMALL_TYPES), +# #expand(OPJ("figures", "{small_type}_unit_clustermap.pdf"), small_type=SMALL_TYPES), # # piRNA and satel_siu raise ValueError: `dataset` input should have multiple elements when plotting # # simrep_siu raise TypeError: Empty 'DataFrame': no numeric data to plot # expand( # OPJ(counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"), # small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu", "pisimi"]), # expand( -# OPJ(output_dir, "figures", "{small_type}_norm_correlations.pdf"), +# OPJ("figures", "{small_type}_norm_correlations.pdf"), # small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu", "pisimi"]), # expand( -# OPJ(output_dir, "figures", "{small_type}_norm_counts_distrib.pdf"), +# OPJ("figures", "{small_type}_norm_counts_distrib.pdf"), # small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu", "pisimi"]), include: ensure_relative(irules["link_raw_data"], workflow.basedir) @@ -732,8 +734,8 @@ rule map_on_genome: # fastq = rules.select_size_range.output.selected, fastq = source_fastq, output: - sam = temp(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s.sam" % genome)), - nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome), + sam = temp(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s.sam" % genome)), + nomap_fastq = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome), params: aligner = aligner, index = index, @@ -754,12 +756,12 @@ rule map_on_genome: def source_sam(wildcards): if hasattr(wildcards, "read_type"): return OPJ( - output_dir, aligner, f"mapped_{genome}", + aligner, f"mapped_{genome}", f"{wildcards.lib}_{wildcards.rep}", f"{wildcards.read_type}_on_{genome}.sam") else: return OPJ( - output_dir, aligner, f"mapped_{genome}", + aligner, f"mapped_{genome}", f"{wildcards.lib}_{wildcards.rep}", f"{size_selected}_on_{genome}.sam") @@ -768,8 +770,8 @@ rule sam2indexedbam: input: sam = source_sam, output: - sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_sorted.bam" % genome), - index = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_sorted.bam.bai" % genome), + sorted_bam = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_sorted.bam" % genome), + index = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_sorted.bam.bai" % genome), message: "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}." benchmark: @@ -787,7 +789,7 @@ rule compute_mapping_stats: input: sorted_bam = rules.sam2indexedbam.output.sorted_bam, output: - stats = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_samtools_stats.txt" % genome), + stats = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_samtools_stats.txt" % genome), shell: """samtools stats {input.sorted_bam} > {output.stats}""" @@ -797,7 +799,7 @@ rule get_nb_mapped: input: stats = rules.compute_mapping_stats.output.stats, output: - nb_mapped = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_nb_mapped.txt" % genome), + nb_mapped = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_nb_mapped.txt" % genome), shell: """samstats2mapped {input.stats} {output.nb_mapped}""" @@ -806,7 +808,7 @@ rule compute_coverage: input: sorted_bam = rules.sam2indexedbam.output.sorted_bam, output: - coverage = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_coverage.txt" % genome), + coverage = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_coverage.txt" % genome), params: genomelen = genomelen, shell: @@ -822,13 +824,13 @@ rule extract_RPF: on "non-coding" regions.""" input: sorted_bam = OPJ( - output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", + aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_sorted.bam"), protein_gtf = OPJ(annot_dir, "protein_coding.gtf") output: - rpf = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_fwd_on_protein_coding.fastq.gz") + rpf = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_fwd_on_protein_coding.fastq.gz") params: - tmp_filtered_bam = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_fwd_on_protein_coding.bam"), + tmp_filtered_bam = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_fwd_on_protein_coding.bam"), shell: """ mkfifo {params.tmp_filtered_bam} @@ -854,7 +856,7 @@ def source_bams(wildcards): """We can count from several bam files with feature_count.""" read_types = wildcards.read_type.split("_and_") sorted_bams = [OPJ( - output_dir, aligner, f"mapped_{genome}", f"{wildcards.lib}_{wildcards.rep}", + aligner, f"mapped_{genome}", f"{wildcards.lib}_{wildcards.rep}", f"{read_type}_on_{genome}_sorted.bam") for read_type in read_types] return sorted_bams @@ -909,7 +911,7 @@ rule count_non_structural_mappers: input: # nb_mapped is the total number of size-selected that mapped #nb_mapped = rules.get_nb_mapped.output.nb_mapped, - nb_mapped = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_nb_mapped.txt"), + nb_mapped = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_nb_mapped.txt"), # structural are fwd to STRUCTURAL_BIOTYPES summary = OPJ( counts_dir, "summaries", @@ -955,10 +957,10 @@ rule plot_read_numbers: # nb_trimmed = rules.trim_and_dedup.output.nb_trimmed, nb_trimmed = rules.trim.output.nb_trimmed, nb_selected = rules.select_size_range.output.nb_selected, - nb_mapped = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_nb_mapped.txt"), + nb_mapped = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_nb_mapped.txt"), nb_RPF = rules.count_RPF.output.nb_RPF, output: - plot = OPJ(output_dir, "figures", "{lib}_{rep}", "nb_reads.pdf"), + plot = OPJ("figures", "{lib}_{rep}", "nb_reads.pdf"), message: "Plotting read numbers for {wildcards.lib}_{wildcards.rep}." # Currently not used @@ -1002,7 +1004,7 @@ rule make_read_counts_summary: nb_RPF = rules.count_RPF.output.nb_RPF, output: summary = OPJ( - output_dir, aligner, "summaries", + aligner, "summaries", "{lib}_{rep}_{read_type}_on_%s_read_counts.txt" % genome), #threads: 8 # to limit memory usage, actually run: @@ -1032,15 +1034,15 @@ rule make_read_counts_summary: rule gather_read_counts_summaries: input: summary_tables = expand(OPJ( - output_dir, aligner, "summaries", + aligner, "summaries", "{name}_%s_on_%s_read_counts.txt" % (size_selected, genome)), name=COND_NAMES), output: summary_table = OPJ( - output_dir, aligner, "summaries", + aligner, "summaries", f"all_{size_selected}_on_{genome}_read_counts.txt"), run: summary_files = (OPJ( - output_dir, aligner, "summaries", + aligner, "summaries", f"{cond_name}_{size_selected}_on_{genome}_read_counts.txt") for cond_name in COND_NAMES) summaries = pd.concat((pd.read_table(summary_file).T.astype(int) for summary_file in summary_files), axis=1) summaries.columns = COND_NAMES @@ -1428,7 +1430,7 @@ rule compute_RPM_folds: summary_table = rules.gather_read_counts_summaries.output.summary_table, output: fold_results = OPJ( - output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", + aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), log: log = OPJ(log_dir, "compute_RPM_folds", "{contrast}_{small_type}.log"), @@ -1478,16 +1480,16 @@ rule gather_RPM_folds: """Gathers RPM folds across contrasts.""" input: fold_results = expand(OPJ( - output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", + aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "{contrast}", "{contrast}_{{small_type}}_RPM_folds.txt"), contrast=DT_CONTRASTS), output: all_folds = OPJ( - output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "{small_type}_mean_log2_RPM_fold.txt"), + aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "{small_type}_mean_log2_RPM_fold.txt"), log: log = OPJ(log_dir, "gather_RPM_folds", "{small_type}.log"), run: pd.DataFrame({contrast : pd.read_table( - OPJ(output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", + OPJ(aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"), index_col=["gene", "cosmid", "name", "small_type"], usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in DT_CONTRASTS}).to_csv( @@ -1566,7 +1568,7 @@ rule make_fold_heatmap: input: all_folds = source_gathered_folds, output: - fold_heatmap = OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"), + fold_heatmap = OPJ("figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"), benchmark: OPJ(log_dir, "make_fold_heatmap", "{small_type}_{fold_type}_benchmark.txt"), threads: 16 # to limit memory usage, actually @@ -1605,8 +1607,8 @@ rule test_size_factor: counts_table = source_counts, summary_table = rules.gather_read_counts_summaries.output.summary_table, output: - norm_correlations = OPJ(output_dir, "figures", "{small_type}_norm_correlations.pdf"), - norm_counts_distrib_plot = OPJ(output_dir, "figures", "{small_type}_norm_counts_distrib.pdf"), + norm_correlations = OPJ("figures", "{small_type}_norm_correlations.pdf"), + norm_counts_distrib_plot = OPJ("figures", "{small_type}_norm_counts_distrib.pdf"), params: size_factor_types = TESTED_SIZE_FACTORS, norm_corr_plot_title = lambda wildcards : f"Correlation across samples between normalized {wildcards.small_type}RNA counts and size factors.", @@ -1744,9 +1746,9 @@ rule make_normalized_bigwig: input: bam = rules.sam2indexedbam.output.sorted_bam, norm_file = source_norm_file, - #median_ratios_file = OPJ(output_dir, aligner, f"mapped_{genome}", "annotation", f"all_{size_selected}_on_{genome}", "all_si_median_ratios_to_pseudo_ref.txt"), + #median_ratios_file = OPJ(aligner, f"mapped_{genome}", "annotation", f"all_{size_selected}_on_{genome}", "all_si_median_ratios_to_pseudo_ref.txt"), output: - bigwig_norm = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), + bigwig_norm = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), params: #orient_filter = bamcoverage_filter, genome_binned = genome_binned, @@ -1821,9 +1823,9 @@ bam2bigwig.sh: bedGraphToBigWig failed rule merge_bigwig_reps: """This rule merges bigwig files by computing a mean across replicates.""" input: - expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{{lib}}_{rep}", "{{lib}}_{rep}_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), rep=REPS), + expand(OPJ(aligner, f"mapped_{genome}", "{{lib}}_{rep}", "{{lib}}_{rep}_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), rep=REPS), output: - bw = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), + bw = OPJ(aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), log: warnings = OPJ(log_dir, "merge_bigwig_reps", "{lib}_mean_{read_type}_on_%s_by_{norm}_{orientation}.warnings" % genome), threads: 2 # to limit memory usage, actually @@ -1867,10 +1869,10 @@ rule merge_bigwig_reps: rule make_bigwig_ratios: """This rule tries to make a bigwig file displaying the ratio between a condition and a reference, from the corresponding two normalized bigwig files.""" input: - cond_bw = OPJ(output_dir, aligner, f"mapped_{genome}", "{cond}_{{rep}}", "{cond}_{{rep}}_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), - ref_bw = OPJ(output_dir, aligner, f"mapped_{genome}", "{ref}_{{rep}}", "{ref}_{{rep}}_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), + cond_bw = OPJ(aligner, f"mapped_{genome}", "{cond}_{{rep}}", "{cond}_{{rep}}_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), + ref_bw = OPJ(aligner, f"mapped_{genome}", "{ref}_{{rep}}", "{ref}_{{rep}}_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), output: - bw = OPJ(output_dir, aligner, f"mapped_{genome}", "{cond}_vs_{ref}_ratio", "{cond}_vs_{ref}_ratio_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), + bw = OPJ(aligner, f"mapped_{genome}", "{cond}_vs_{ref}_ratio", "{cond}_vs_{ref}_ratio_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), log: warnings = OPJ(log_dir, "make_bigwig_ratios", "{cond}_vs_{ref}_ratio_{read_type}_on_%s_by_{norm}_{orientation}.warnings" % genome), threads: 8 # to limit memory usage, actually @@ -2287,7 +2289,7 @@ rule select_gene_list_for_meta_profile: # input: # source_bigwig, # output: -# OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_{orientation}_TSS.pdf"), +# OPJ("figures", "{lib}_{rep}", "{read_type}_{orientation}_TSS.pdf"), # log: # plot_TSS_log = OPJ(log_dir, "plot_TSS_profile", "{lib}_{rep}", "{read_type}_{orientation}.log"), # plot_TSS_err = OPJ(log_dir, "plot_TSS_profile", "{lib}_{rep}", "{read_type}_{orientation}.err"), @@ -2368,7 +2370,7 @@ def meta_params(wildcards): # bigwig = rules.make_bigwig.output.bigwig_norm, # bed = rules.select_genes_for_meta_profile.output.out_bed, # output: -# OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_meta_profile.pdf"), +# OPJ("figures", "{lib}_{rep}", "{read_type}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_meta_profile.pdf"), # params: # meta_params = meta_params, # # before = META_MARGIN, @@ -2402,10 +2404,10 @@ def meta_params(wildcards): # rule plot_meta_profile_mean: # input: # bigwig = rules.merge_bigwig_reps.output.bw, -# #bw = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", "{read_type}_on_%s_norm_{orientation}.bw" % genome), +# #bw = OPJ(aligner, f"mapped_{genome}", "{lib}_mean", "{read_type}_on_%s_norm_{orientation}.bw" % genome), # bed = rules.select_genes_for_meta_profile.output.out_bed # output: -# OPJ(output_dir, "figures", "{lib}_mean", "{read_type}_{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), +# OPJ("figures", "{lib}_mean", "{read_type}_{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), # params: # meta_params = meta_params, # # before = META_MARGIN, @@ -2440,7 +2442,7 @@ def get_libs_for_series(wildcards): def source_bigwigs(wildcards): return expand( - OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", + OPJ(aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), lib=get_libs_for_series(wildcards), read_type=[wildcards.read_type], norm=wildcards.norm, orientation=wildcards.orientation) @@ -2456,12 +2458,12 @@ def make_y_label(wildcards): rule plot_biotype_mean_meta_profile: input: #bigwig = rules.merge_bigwig_reps.output.bw, - #bws = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), lib=LIBS), - #bws = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), lib=series_dict[wildcards.series_type][wildcards.series]), + #bws = expand(OPJ(aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), lib=LIBS), + #bws = expand(OPJ(aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), lib=series_dict[wildcards.series_type][wildcards.series]), bws = source_bigwigs, bed = rules.select_genes_for_meta_profile.output.out_bed output: - OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}", "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{series_type}_{series}_meta_profile.pdf"), + OPJ("figures", "mean_meta_profiles_meta_scale_{meta_scale}", "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{series_type}_{series}_meta_profile.pdf"), params: meta_params = meta_params, #labels = LIBS, @@ -2503,12 +2505,12 @@ rule plot_biotype_mean_meta_profile: rule plot_gene_list_mean_meta_profile: input: #bigwig = rules.merge_bigwig_reps.output.bw, - #bws = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), lib=LIBS), - #bws = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), lib=series_dict[wildcards.series_type][wildcards.series]), + #bws = expand(OPJ(aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), lib=LIBS), + #bws = expand(OPJ(aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), lib=series_dict[wildcards.series_type][wildcards.series]), bws = source_bigwigs, bed = rules.select_gene_list_for_meta_profile.output.out_bed output: - OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}", "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{id_list}_{series_type}_{series}_meta_profile.pdf"), + OPJ("figures", "mean_meta_profiles_meta_scale_{meta_scale}", "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{id_list}_{series_type}_{series}_meta_profile.pdf"), params: meta_params = meta_params, #labels = LIBS, @@ -2576,10 +2578,10 @@ rule small_RNA_differential_expression: counts_table = source_counts, summary_table = rules.gather_read_counts_summaries.output.summary_table, output: - deseq_results = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_deseq2.txt"), - up_genes = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_up_genes.txt"), - down_genes = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_down_genes.txt"), - counts_and_res = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_counts_and_res.txt"), + deseq_results = OPJ(aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_deseq2.txt"), + up_genes = OPJ(aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_up_genes.txt"), + down_genes = OPJ(aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_down_genes.txt"), + counts_and_res = OPJ(aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_counts_and_res.txt"), log: warnings = OPJ(log_dir, "small_RNA_differential_expression", "{contrast}_{small_type}.warnings"), threads: 4 # to limit memory usage, actually @@ -2663,16 +2665,16 @@ rule gather_DE_folds: """Gathers DE folds across contrasts.""" input: fold_results = expand(OPJ( - output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", + aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "{contrast}", "{contrast}_{{small_type}}_counts_and_res.txt"), contrast=DE_CONTRASTS), output: all_folds = OPJ( - output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "all", "{small_type}_{fold_type}.txt"), + aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "all", "{small_type}_{fold_type}.txt"), log: log = OPJ(log_dir, "gather_DE_folds", "{small_type}_{fold_type}.log"), run: pd.DataFrame({contrast : pd.read_table( - OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", + OPJ(aligner, f"mapped_{genome}", f"deseq2_{size_selected}", f"{contrast}", f"{contrast}_{wildcards.small_type}_counts_and_res.txt"), index_col=["gene", "cosmid", "name", "small_type"], usecols=["gene", "cosmid", "name", "small_type", wildcards.fold_type])[wildcards.fold_type] for contrast in DE_CONTRASTS}).to_csv( @@ -2686,8 +2688,8 @@ rule plot_scatterplots: input: counts_and_res = rules.small_RNA_differential_expression.output.counts_and_res, output: - #pairplot = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_pairplot.pdf"), - pairplots = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_pairplots.pdf"), + #pairplot = OPJ(aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_pairplot.pdf"), + pairplots = OPJ(aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_pairplots.pdf"), run: counts_and_res = pd.read_table(input.counts_and_res, index_col="gene") (cond, ref) = CONTRAST2PAIR[wildcards.contrast] @@ -2741,7 +2743,7 @@ def source_fold_data(wildcards): if fold_type in {"log2FoldChange", "lfcMLE"}: if hasattr(wildcards, "contrast_type"): return expand( - OPJ(output_dir, aligner, f"mapped_{genome}", + OPJ(aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{{small_type}}_counts_and_res.txt"), contrast=contrasts_dict[wildcards.contrast_type]) @@ -2752,7 +2754,7 @@ def source_fold_data(wildcards): #https://stackoverflow.com/a/26791923/1878788 #print("{0.small_type}".format(wildcards)) return [filename.format(wildcards) for filename in expand( - OPJ(output_dir, aligner, f"mapped_{genome}", + OPJ(aligner, f"mapped_{genome}", "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{{0.small_type}}_RPM_folds.txt"), contrast=contrasts_dict[wildcards.contrast_type])] @@ -2774,7 +2776,7 @@ rule make_gene_list_lfc_boxplots: input: data = source_fold_data, output: - boxplots = OPJ(output_dir, "figures", "{contrast}", + boxplots = OPJ("figures", "{contrast}", "{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf") params: id_lists = set_id_lists, @@ -2794,7 +2796,7 @@ rule make_contrast_lfc_boxplots: input: data = source_fold_data, output: - boxplots = OPJ(output_dir, "figures", "all_{contrast_type}", + boxplots = OPJ("figures", "all_{contrast_type}", "{contrast_type}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf") params: id_lists = set_id_lists, @@ -2825,7 +2827,7 @@ rule compute_size_distribution: input: source_fastq output: - OPJ(output_dir, "read_stats", "{lib}_{rep}_{read_type}_size_distribution.txt") + OPJ("read_stats", "{lib}_{rep}_{read_type}_size_distribution.txt") message: "Computing read size distribution for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}." shell: @@ -2838,7 +2840,7 @@ rule plot_size_distribution: input: rules.compute_size_distribution.output output: - OPJ(output_dir, "figures", "{lib}_{{rep}}", "{read_type}_size_distribution.pdf"), + OPJ("figures", "{lib}_{{rep}}", "{read_type}_size_distribution.pdf"), message: "Plotting size distribution for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}." run: @@ -3035,7 +3037,7 @@ rule plot_base_logo_along: input: data = rules.compute_base_composition_along.output.proportions, output: - figure = OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_base_logo_from_{position}.pdf"), + figure = OPJ("figures", "{lib}_{rep}", "{read_type}_base_logo_from_{position}.pdf"), message: "Plotting base logo from {wildcards.position} for {wildcards.read_type} reads for {wildcards.lib}_{wildcards.rep}." log: @@ -3063,7 +3065,7 @@ rule plot_base_logo: input: data = rules.compute_base_proportions.output.data, output: - figure = OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_{position}_base_logo.pdf"), + figure = OPJ("figures", "{lib}_{rep}", "{read_type}_{position}_base_logo.pdf"), message: "Plotting {wildcards.position} base logo for {wildcards.read_type} reads for {wildcards.lib}_{wildcards.rep}." log: @@ -3091,7 +3093,7 @@ rule plot_base_composition_along: input: data = rules.compute_base_composition_along.output.composition, output: - figure = OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_base_composition_from_{position}.pdf"), + figure = OPJ("figures", "{lib}_{rep}", "{read_type}_base_composition_from_{position}.pdf"), message: "Plotting base composition from {wildcards.position} for {wildcards.read_type} reads for {wildcards.lib}_{wildcards.rep}." log: @@ -3119,7 +3121,7 @@ rule plot_base_composition: input: data = rules.compute_base_composition.output.data, output: - figure = OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_{position}_base_composition.pdf"), + figure = OPJ("figures", "{lib}_{rep}", "{read_type}_{position}_base_composition.pdf"), message: "Plotting {wildcards.position} base composition for {wildcards.read_type} reads for {wildcards.lib}_{wildcards.rep}." log: @@ -3156,7 +3158,7 @@ rule plot_summary_barchart: input: rules.make_read_counts_summary.output output: - barchart = OPJ(output_dir, "figures", "{lib}_{rep}", "%s_smallRNA_barchart.pdf" % size_selected), + barchart = OPJ("figures", "{lib}_{rep}", "%s_smallRNA_barchart.pdf" % size_selected), message: "Plotting small RNA barchart for {wildcards.lib}_{wildcards.rep}_%s." % size_selected log: @@ -3275,8 +3277,8 @@ rule small_RNA_PCA: #counts_table = rules.gather_small_RNA_counts.output.counts_table, standardized_table = rules.standardize_counts.output.standardized_table, output: - pca = OPJ(output_dir, "figures", "{small_type}_{standard}_PCA.pdf"), - components_distrib = OPJ(output_dir, "figures", "{small_type}_{standard}_PC1_PC2_distrib.pdf"), + pca = OPJ("figures", "{small_type}_{standard}_PCA.pdf"), + components_distrib = OPJ("figures", "{small_type}_{standard}_PC1_PC2_distrib.pdf"), threads: 8 # to limit memory usage, actually run: #counts_data = pd.read_table( @@ -3340,19 +3342,19 @@ rule small_RNA_PCA: rule explore_counts: input: counts_table = source_counts, - standardized_table = OPJ(output_dir, aligner, f"mapped_{genome}", "annotation", f"all_{size_selected}_on_{genome}", "{small_type}_zscore.txt"), - normalized_table = OPJ(output_dir, aligner, f"mapped_{genome}", "annotation", f"all_{size_selected}_on_{genome}", "{small_type}_unit.txt"), + standardized_table = OPJ(aligner, f"mapped_{genome}", "annotation", f"all_{size_selected}_on_{genome}", "{small_type}_zscore.txt"), + normalized_table = OPJ(aligner, f"mapped_{genome}", "annotation", f"all_{size_selected}_on_{genome}", "{small_type}_unit.txt"), summary_table = rules.gather_read_counts_summaries.output.summary_table, #standardized_table = rules.standardize_counts.output.standardized_table, - up_genes_list = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_up_genes.txt"), - down_genes_list = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_down_genes.txt"), - deseq_results = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_deseq2.txt"), + up_genes_list = OPJ(aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_up_genes.txt"), + down_genes_list = OPJ(aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_down_genes.txt"), + deseq_results = OPJ(aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_deseq2.txt"), output: - #pca = OPJ(output_dir, "figures", "{small_type}_{standard}_PCA.pdf"), - #components_distrib = OPJ(output_dir, "figures", "{small_type}_{standard}_PC1_PC2_distrib.pdf"), - #clustermap = OPJ(output_dir, "figures", "{contrast}_{small_type}_clustermap.pdf"), - zscore_clustermap = OPJ(output_dir, "figures", "{contrast}", "{small_type}_zscore_clustermap.pdf"), - #unit_clustermap = OPJ(output_dir, "figures", "{contrast}_{small_type}_unit_clustermap.pdf"), + #pca = OPJ("figures", "{small_type}_{standard}_PCA.pdf"), + #components_distrib = OPJ("figures", "{small_type}_{standard}_PC1_PC2_distrib.pdf"), + #clustermap = OPJ("figures", "{contrast}_{small_type}_clustermap.pdf"), + zscore_clustermap = OPJ("figures", "{contrast}", "{small_type}_zscore_clustermap.pdf"), + #unit_clustermap = OPJ("figures", "{contrast}_{small_type}_unit_clustermap.pdf"), threads: 8 # to limit memory usage, actually run: counts_data = pd.read_table( @@ -3362,7 +3364,6 @@ rule explore_counts: # Gathering count summaries (for normalization) ################################################ #summary_files = (OPJ( - # output_dir, # aligner, # "summaries", # f"{cond_name}_{size_selected}_on_{genome}_read_counts.txt") for cond_name in COND_NAMES) diff --git a/run_pipeline.sh b/run_pipeline.sh index 82f264c48efd3d8bf0a50a615018298eaa7c5e78..ff7681f308c71c1db3eb4d7f67fd65f5136d1a5d 100755 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -67,24 +67,32 @@ shift if [ -e ${configfile} ] then - kilobytes_tot=$(mawk '$1 == "MemTotal:" {print $2}' /proc/meminfo) - # Some rules were given a "mem_mb" resource section based on the "max_vms" benchmarking result. - # See the /pasteur/homes/bli/Documents/Informatique/benchmarks/Pipeline_benchmarking/Pipeline_benchmarking.ipynb jupyter notebook. - # These values are in megabytes (https://stackoverflow.com/a/47201241/1878788) - # We divide the total memory (in kB) by 1100 instead of 1000 - # to avoid pretending that we have all this memory available for snakemake rules. - megabytes_resource=$(echo "${kilobytes_tot} / 1100" | bc) - cmd="snakemake -s ${snakefile} --configfile ${configfile} --resources mem_mb=${megabytes_resource} $@" + echo "Pipeline configuration found: ${configfile}" else error_exit "Pipeline configuration file ${configfile} not found." fi # Determine the output directory and where to log the pipeline (fragile!) output_dir=$(grep "output_dir" "${configfile}" | mawk '{print $NF}' | sed 's/,$//' | sed 's/"//g') +mkdir -p ${output_dir} start_day=$(date +"%Y-%m-%d") -find_older_output="find ${output_dir} -depth ! -newermt ${start_day} -print" log_base="${output_dir}/$(date +"%d%m%y_%Hh%Mm")" -mkdir -p ${output_dir} + +config_base=$(basename ${configfile}) +config_snapshot="${output_dir}/${config_base}" +echo "Saving a local copy of the configuration in ${config_snapshot}" +cp -f ${configfile} ${config_snapshot} + +kilobytes_tot=$(mawk '$1 == "MemTotal:" {print $2}' /proc/meminfo) +# Some rules were given a "mem_mb" resource section based on the "max_vms" benchmarking result. +# See the /pasteur/homes/bli/Documents/Informatique/benchmarks/Pipeline_benchmarking/Pipeline_benchmarking.ipynb jupyter notebook. +# These values are in megabytes (https://stackoverflow.com/a/47201241/1878788) +# We divide the total memory (in kB) by 1100 instead of 1000 +# to avoid pretending that we have all this memory available for snakemake rules. +megabytes_resource=$(echo "${kilobytes_tot} / 1100" | bc) + +cmd="(cd ${output_dir}; snakemake -s ${snakefile} --configfile ${config_base} --resources mem_mb=${megabytes_resource} $@)" + echo ${cmd} > ${log_base}.log # https://unix.stackexchange.com/a/245610/55127 # https://stackoverflow.com/a/692407/1878788 @@ -93,7 +101,6 @@ echo ${cmd} > ${log_base}.log eval ${cmd} > >(tee -a ${log_base}.log) 2> >(tee -a ${log_base}.err >&2) || error_exit "${cmd} failed, see ${log_base}.err" end_day=$(date +"%Y-%m-%d") -#echo -e "This run started on ${start_day}.\nIf you want to find all older output, you can run the following command:\n${find_older_output}\n(Use -delete instead of -print to remove those files (do this only in case of full output update).)" 1>&2 echo -e "This run started on ${start_day} and ended on ${end_day}.\n" 1>&2 diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile index 1708687a37ebf76a3521787c86f2fc4beae5b78a..8c516108b200ccb36029a6f17f201a6354d8e286 100644 --- a/small_RNA-seq/small_RNA-seq.snakefile +++ b/small_RNA-seq/small_RNA-seq.snakefile @@ -405,12 +405,14 @@ gene_lists_dir = genome_dict["gene_lists_dir"] avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) index = genome_db -output_dir = config["output_dir"] -local_annot_dir = config.get("local_annot_dir", OPJ(output_dir, "annotations")) -log_dir = config.get("log_dir", OPJ(output_dir, "logs")) -data_dir = config.get("data_dir", OPJ(output_dir, "data")) +#output_dir = config["output_dir"] +#workdir: config["output_dir"] +output_dir = os.path.abspath(".") +local_annot_dir = config.get("local_annot_dir", OPJ("annotations")) +log_dir = config.get("log_dir", OPJ("logs")) +data_dir = config.get("data_dir", OPJ("data")) # To put the results of small_RNA_seq_annotate -mapping_dir = OPJ(output_dir, aligner, f"mapped_{genome}") +mapping_dir = OPJ(aligner, f"mapped_{genome}") reads_dir = OPJ(mapping_dir, "reads") annot_counts_dir = OPJ(mapping_dir, "annotation") # The order after pi and mi must match: this is used in make_read_counts_summary @@ -619,7 +621,7 @@ meta_profiles = [ #expand(OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}.bed"), type_set=["all", "protein_coding", "protein_coding_TE"], min_dist="0 5 10 25 50 100 250 500 1000 2500 5000 10000".split()), #expand(OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}_{biotype}_min_{min_len}.bed"), type_set=["all", "protein_coding", "protein_coding_TE"], min_dist="0 5 10 25 50 100 250 500 1000 2500 5000 10000".split(), biotype=["protein_coding"], min_len=[str(META_MIN_LEN)]), [expand( - OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}", + OPJ("figures", "mean_meta_profiles_meta_scale_{meta_scale}", "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{group_type}_{lib_group}_meta_profile.pdf"), meta_scale=[str(META_SCALE)], read_type=READ_TYPES_FOR_METAPROFILES, @@ -628,14 +630,14 @@ meta_profiles = [ group_type=[group_type], lib_group=list(lib_group.keys())) for (group_type, lib_group) in lib_groups.items()], # Same as above ? # [expand( - # OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}", + # OPJ("figures", "mean_meta_profiles_meta_scale_{meta_scale}", # "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{group_type}_{lib_group}_meta_profile.pdf"), # meta_scale=[str(META_SCALE)], read_type=[size_selected, "siRNA", "siuRNA", "miRNA", "piRNA", "all_siRNA"], # norm=SIZE_FACTORS, orientation=["all"], type_set=["protein_coding_TE"], min_dist=[str(MIN_DIST)], # biotype=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"], min_len=[str(META_MIN_LEN)], # group_type=[group_type], lib_group=list(lib_group.keys())) for (group_type, lib_group) in lib_groups.items()], [expand( - OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}", + OPJ("figures", "mean_meta_profiles_meta_scale_{meta_scale}", "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{group_type}_{lib_group}_meta_profile.pdf"), meta_scale=[str(META_SCALE)], read_type=READ_TYPES_FOR_METAPROFILES, @@ -643,7 +645,7 @@ meta_profiles = [ biotype=["protein_coding"], min_len=[str(META_MIN_LEN)], group_type=[group_type], lib_group=list(lib_group.keys())) for (group_type, lib_group) in lib_groups.items()], [expand( - OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}", + OPJ("figures", "mean_meta_profiles_meta_scale_{meta_scale}", "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{id_list}_{group_type}_{lib_group}_meta_profile.pdf"), meta_scale=[str(META_SCALE)], read_type=READ_TYPES_FOR_METAPROFILES, @@ -651,7 +653,7 @@ meta_profiles = [ group_type=[group_type], lib_group=list(lib_group.keys())) for (group_type, lib_group) in lib_groups.items()], ## TODO: Resolve issue with bedtools # expand( - # OPJ(output_dir, "figures", "{lib}_{rep}", + # OPJ("figures", "{lib}_{rep}", # "{read_type}_by_{norm}_{orientation}_pi_targets_in_{biotype}_profile.pdf"), # lib=LIBS, rep=REPS, read_type=READ_TYPES_FOR_METAPROFILES, # norm=SIZE_FACTORS, orientation=["all"], biotype=["protein_coding"]), @@ -660,34 +662,34 @@ meta_profiles = [ read_graphs = [ expand( expand( - OPJ(output_dir, "figures", "{{lib}}_{{rep}}", "{read_type}_base_composition_from_{position}.pdf"), + OPJ("figures", "{{lib}}_{{rep}}", "{read_type}_base_composition_from_{position}.pdf"), read_type=READ_TYPES_FOR_COMPOSITION, position=["start", "end"]), filtered_product, lib=LIBS, rep=REPS), expand( expand( - OPJ(output_dir, "figures", "{{lib}}_{{rep}}", "{read_type}_base_logo_from_{position}.pdf"), + OPJ("figures", "{{lib}}_{{rep}}", "{read_type}_base_logo_from_{position}.pdf"), read_type=READ_TYPES_FOR_COMPOSITION, position=["start", "end"]), filtered_product, lib=LIBS, rep=REPS), expand( expand( - OPJ(output_dir, "figures", "{{lib}}_{{rep}}", "{read_type}_{position}_base_composition.pdf"), + OPJ("figures", "{{lib}}_{{rep}}", "{read_type}_{position}_base_composition.pdf"), read_type=READ_TYPES_FOR_COMPOSITION, position=POSITIONS), filtered_product, lib=LIBS, rep=REPS), expand( expand( - OPJ(output_dir, "figures", "{{lib}}_{{rep}}", "{read_type}_{position}_base_logo.pdf"), + OPJ("figures", "{{lib}}_{{rep}}", "{read_type}_{position}_base_logo.pdf"), read_type=READ_TYPES_FOR_COMPOSITION, position=POSITIONS), filtered_product, lib=LIBS, rep=REPS), expand( expand( - OPJ(output_dir, "figures", "{{lib}}_{{rep}}", "{read_type}_size_distribution.pdf"), + OPJ("figures", "{{lib}}_{{rep}}", "{read_type}_size_distribution.pdf"), read_type=["trimmed", "nomap"]), filtered_product, lib=LIBS, rep=REPS), expand( - OPJ(output_dir, "figures", "{lib}_{rep}", f"{size_selected}_smallRNA_barchart.pdf"), + OPJ("figures", "{lib}_{rep}", f"{size_selected}_smallRNA_barchart.pdf"), filtered_product, lib=LIBS, rep=REPS), expand( - OPJ(output_dir, "figures", "{lib}_{rep}", "nb_reads.pdf"), + OPJ("figures", "{lib}_{rep}", "nb_reads.pdf"), filtered_product, lib=LIBS, rep=REPS), ] @@ -700,7 +702,7 @@ remapped_fold_boxplots = [] if contrasts_dict["ip"]: if BOXPLOT_BIOTYPES: remapped_fold_boxplots = expand( - OPJ(output_dir, "figures", "{contrast}", "by_biotypes", + OPJ("figures", "{contrast}", "by_biotypes", "{contrast}_{read_type}_{mapping_type}_{biotypes}_{orientation}_transcript_{fold_type}_{gene_list}_boxplots.pdf"), contrast=IP_CONTRASTS, read_type=READ_TYPES_FOR_COUNTING, @@ -736,24 +738,24 @@ if contrasts_dict["ip"]: # "{contrast}_{small_type}_{mapping_type}_{biotype}_{orientation}_transcript_RPM_folds.txt"), # contrast=IP_CONTRASTS, small_type=READ_TYPES_FOR_COUNTING, mapping_type=[f"on_{genome}"], biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS), ip_fold_heatmaps = expand( - OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"), + OPJ("figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"), small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"]) ip_fold_boxplots = expand( - OPJ(output_dir, "figures", "all_{contrast_type}", + OPJ("figures", "all_{contrast_type}", "{contrast_type}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf"), contrast_type=["ip"], small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"], gene_list=BOXPLOT_GENE_LISTS) # Subdivided by biotype (to enable distinction between 5'UTR, CDS and 3'UTR) ## TODO: activate this? # ip_fold_boxplots = expand( - # OPJ(output_dir, "figures", "all_{contrast_type}", + # OPJ("figures", "all_{contrast_type}", # "{contrast_type}_{small_type}_{fold_type}_{gene_list}_{biotype}_boxplots.pdf"), # contrast_type=["ip"], small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"], # gene_list=BOXPLOT_GENE_LISTS, biotype=["protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"]) ## if contrasts_dict["de"]: de_fold_heatmaps = expand( - OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"), + OPJ("figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"), small_type=DE_TYPES, fold_type=["log2FoldChange"]) exploratory_graphs = [ @@ -765,30 +767,30 @@ exploratory_graphs = [ # contrast=DE_CONTRASTS, small_type=DE_TYPES, norm=DE_SIZE_FACTORS), ## TODO: debug PCA #expand( - # OPJ(output_dir, "figures", "{small_type}_{standard}_PCA.pdf"), + # OPJ("figures", "{small_type}_{standard}_PCA.pdf"), # small_type=["pisimi"], standard=STANDARDS), #expand( - # OPJ(output_dir, "figures", "{small_type}_{standard}_PC1_PC2_distrib.pdf"), + # OPJ("figures", "{small_type}_{standard}_PC1_PC2_distrib.pdf"), # small_type=["pisimi"], standard=STANDARDS), ## #expand( - # OPJ(output_dir, "figures", "{small_type}_clustermap.pdf"), + # OPJ("figures", "{small_type}_clustermap.pdf"), # small_type=SMALL_TYPES), #expand( - # OPJ(output_dir, "figures", "{small_type}_zscore_clustermap.pdf"), + # OPJ("figures", "{small_type}_zscore_clustermap.pdf"), # small_type=SMALL_TYPES), #expand( - # OPJ(output_dir, "figures", "{contrast}", "{small_type}_zscore_clustermap.pdf"), + # OPJ("figures", "{contrast}", "{small_type}_zscore_clustermap.pdf"), # contrast=CONTRASTS, small_type=DE_TYPES), ] de_fold_boxplots = expand( - OPJ(output_dir, "figures", "{contrast}", + OPJ("figures", "{contrast}", "{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf"), contrast=DE_CONTRASTS, small_type=DE_TYPES, fold_type=["log2FoldChange", "mean_log2_RPM_fold"], gene_list=["all_gene_lists"]) ip_fold_boxplots_by_contrast = expand( - OPJ(output_dir, "figures", "{contrast}", + OPJ("figures", "{contrast}", "{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf"), contrast=IP_CONTRASTS, small_type=IP_TYPES, fold_type=["mean_log2_RPM_fold"], gene_list=["all_gene_lists"]) @@ -848,7 +850,7 @@ rule all: remapped_folds, fold_boxplots, remapped_fold_boxplots, - #expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.pdf"), small_type=SMALL_TYPES), + #expand(OPJ("figures", "{small_type}_unit_clustermap.pdf"), small_type=SMALL_TYPES), #expand(OPJ( # feature_counts_dir, # "all_{read_type}_{mapping_type}_{biotype}_{orientation}_transcript_counts.txt"), read_type=READ_TYPES_FOR_COUNTING, mapping_type=[f"on_{genome}"], biotype=ANNOT_BIOTYPES, orientation=ORIENTATIONS), @@ -862,10 +864,10 @@ rule all: # piRNA and satel_siu raise ValueError: `dataset` input should have multiple elements when plotting # simrep_siu raise TypeError: Empty 'DataFrame': no numeric data to plot expand( - OPJ(output_dir, "figures", "{small_type}_norm_correlations.pdf"), + OPJ("figures", "{small_type}_norm_correlations.pdf"), small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G"]), expand( - OPJ(output_dir, "figures", "{small_type}_norm_counts_distrib.pdf"), + OPJ("figures", "{small_type}_norm_counts_distrib.pdf"), small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G"]), #absolute = "/pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/includes/link_raw_data.rules" @@ -993,7 +995,7 @@ rule map_on_genome: fastq = rules.select_size_range.output.selected, output: sam = temp(OPJ(mapping_dir, "{lib}_{rep}", "%s_on_%s.sam" % (size_selected, genome))), - nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_%s_unmapped_on_%s.fastq.gz" % (size_selected, genome)), + nomap_fastq = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_%s_unmapped_on_%s.fastq.gz" % (size_selected, genome)), params: aligner = aligner, index = genome_db, @@ -1016,11 +1018,11 @@ rule map_on_genome: rule extract_nomap_siRNAs: """Extract from the non-mappers those that end in poly-T, and could be mappable after T-tail trimming.""" input: - OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_%s_unmapped_on_%s.fastq.gz" % (size_selected, genome)), + OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_%s_unmapped_on_%s.fastq.gz" % (size_selected, genome)), #rules.map_on_genome.output.nomap_fastq, output: - nomap_si = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_%s_unmapped_siRNA.fastq.gz" % size_selected), - nb_nomap_si = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_%s_unmapped_siRNA.txt" % size_selected), + nomap_si = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_%s_unmapped_siRNA.fastq.gz" % size_selected), + nb_nomap_si = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_%s_unmapped_siRNA.txt" % size_selected), message: "Extracting G.{{%d,%d}}T+ reads from unmapped {wildcards.lib}_{wildcards.rep}_%s." % (SI_MIN - 2, SI_MAX - 1, size_selected) shell: @@ -1043,7 +1045,7 @@ rule remap_on_genome: fastq = source_fastq, output: sam = temp(OPJ(mapping_dir, "{lib}_{rep}", "{read_type}_on_%s.sam" % genome)), - nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome), + nomap_fastq = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome), # Here we don't map size_selected wildcard_constraints: read_type = "|".join(set(READ_TYPES_FOR_MAPPING + READ_TYPES_FOR_COUNTING) - {size_selected}) @@ -1542,7 +1544,7 @@ rule plot_read_numbers: nb_mapped = rules.small_RNA_seq_annotate.output.nb_mapped, nb_mappedU = rules.small_RNA_seq_annotate_U.output.nb_mapped, output: - plot = OPJ(output_dir, "figures", "{lib}_{rep}", "nb_reads.pdf"), + plot = OPJ("figures", "{lib}_{rep}", "nb_reads.pdf"), message: "Plotting read numbers for {wildcards.lib}_{wildcards.rep}." # Currently not used @@ -1998,7 +2000,7 @@ rule make_read_counts_summary: # mi_counts = rules.small_RNA_seq_annotate.output.mi_counts, output: summary = OPJ( - output_dir, aligner, "summaries", + aligner, "summaries", "{lib}_{rep}_%s_on_%s_read_counts.txt" % (size_selected, genome)), benchmark: OPJ(log_dir, "make_read_counts_summary", "{lib}_{rep}_benchmark.txt"), @@ -2083,15 +2085,15 @@ rule compute_median_ratio_to_pseudo_ref_size_factors: rule gather_read_counts_summaries: input: summary_tables = expand(OPJ( - output_dir, aligner, "summaries", + aligner, "summaries", "{name}_%s_on_%s_read_counts.txt" % (size_selected, genome)), name=COND_NAMES), output: summary_table = OPJ( - output_dir, aligner, "summaries", + aligner, "summaries", "all_%s_on_%s_read_counts.txt" % (size_selected, genome)), run: summary_files = (OPJ( - output_dir, aligner, "summaries", + aligner, "summaries", f"{cond_name}_{size_selected}_on_{genome}_read_counts.txt") for cond_name in COND_NAMES) summaries = pd.concat((pd.read_table(summary_file).T.astype(int) for summary_file in summary_files), axis=1) summaries.columns = COND_NAMES @@ -2412,7 +2414,7 @@ rule make_fold_heatmap: input: all_folds = source_gathered_folds, output: - fold_heatmap = OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"), + fold_heatmap = OPJ("figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"), log: warnings = OPJ(log_dir, "make_fold_heatmap", "{small_type}_{fold_type}.warnings"), benchmark: @@ -2484,8 +2486,8 @@ rule test_size_factor: counts_table = source_small_RNA_counts, summary_table = rules.gather_read_counts_summaries.output.summary_table, output: - norm_correlations = OPJ(output_dir, "figures", "{small_type}_norm_correlations.pdf"), - norm_counts_distrib_plot = OPJ(output_dir, "figures", "{small_type}_norm_counts_distrib.pdf"), + norm_correlations = OPJ("figures", "{small_type}_norm_correlations.pdf"), + norm_counts_distrib_plot = OPJ("figures", "{small_type}_norm_counts_distrib.pdf"), params: size_factor_types = TESTED_SIZE_FACTORS, norm_corr_plot_title = lambda wildcards: f"Correlation across samples between normalized {wildcards.small_type}RNA counts and size factors.", @@ -3224,7 +3226,7 @@ rule select_gene_list_for_meta_profile: # input: # source_bigwig, # output: -# OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_{orientation}_TSS.pdf"), +# OPJ("figures", "{lib}_{rep}", "{read_type}_{orientation}_TSS.pdf"), # log: # plot_TSS_log = OPJ(log_dir, "plot_TSS_profile", "{lib}_{rep}", "{read_type}_{orientation}.log"), # plot_TSS_err = OPJ(log_dir, "plot_TSS_profile", "{lib}_{rep}", "{read_type}_{orientation}.err"), @@ -3305,7 +3307,7 @@ def meta_params(wildcards): # bigwig = rules.make_bigwig.output.bigwig_norm, # bed = rules.select_genes_for_meta_profile.output.out_bed, # output: -# OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_meta_profile.pdf"), +# OPJ("figures", "{lib}_{rep}", "{read_type}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_meta_profile.pdf"), # params: # meta_params = meta_params, # # before = META_MARGIN, @@ -3342,7 +3344,7 @@ def meta_params(wildcards): # #bw = OPJ(mapping_dir, "{lib}_mean", "{read_type}_on_%s_norm_{orientation}.bw" % genome), # bed = rules.select_genes_for_meta_profile.output.out_bed # output: -# OPJ(output_dir, "figures", "{lib}_mean", "{read_type}_{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), +# OPJ("figures", "{lib}_mean", "{read_type}_{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.pdf" % (MIN_DIST, META_MIN_LEN)), # params: # meta_params = meta_params, # # before = META_MARGIN, @@ -3398,7 +3400,7 @@ rule plot_biotype_mean_meta_profile: bws = source_bigwigs, bed = rules.select_genes_for_meta_profile.output.out_bed output: - OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}", + OPJ("figures", "mean_meta_profiles_meta_scale_{meta_scale}", "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{biotype}_min_{min_len}_{group_type}_{lib_group}_meta_profile.pdf"), params: meta_params = meta_params, @@ -3451,7 +3453,7 @@ rule plot_gene_list_mean_meta_profile: bws = source_bigwigs, bed = rules.select_gene_list_for_meta_profile.output.out_bed output: - OPJ(output_dir, "figures", "mean_meta_profiles_meta_scale_{meta_scale}", + OPJ("figures", "mean_meta_profiles_meta_scale_{meta_scale}", "{read_type}_by_{norm}_{orientation}_on_{type_set}_merged_isolated_{min_dist}_{id_list}_{group_type}_{lib_group}_meta_profile.pdf"), params: meta_params = meta_params, @@ -3567,7 +3569,7 @@ rule plot_pi_targets_profile: bigwig = rules.make_normalized_bigwig.output.bigwig_norm, bed = rules.locate_pi_targets.output.bed, output: - figure = OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_by_{norm}_{orientation}_pi_targets_in_{biotype}_profile.pdf"), + figure = OPJ("figures", "{lib}_{rep}", "{read_type}_by_{norm}_{orientation}_pi_targets_in_{biotype}_profile.pdf"), params: y_label = make_y_label, log: @@ -3858,7 +3860,7 @@ rule make_gene_list_lfc_boxplots: input: data = source_fold_data, output: - boxplots = OPJ(output_dir, "figures", "{contrast}", + boxplots = OPJ("figures", "{contrast}", "{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf") log: warnings = OPJ(log_dir, "make_gene_list_lfc_boxplots", "{contrast}_{small_type}_{fold_type}_{gene_list}.warnings"), @@ -3907,7 +3909,7 @@ rule make_contrast_lfc_boxplots: input: data = source_fold_data, output: - boxplots = OPJ(output_dir, "figures", "all_{contrast_type}", + boxplots = OPJ("figures", "all_{contrast_type}", "{contrast_type}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf") log: warnings = OPJ(log_dir, "make_contrast_lfc_boxplots", "{contrast_type}_{small_type}_{fold_type}_{gene_list}_boxplots.warnings"), @@ -3972,7 +3974,7 @@ rule make_biotype_lfc_boxplots: #print_wc, data = source_fold_data, output: - boxplots = OPJ(output_dir, "figures", "{contrast}", "by_biotypes", + boxplots = OPJ("figures", "{contrast}", "by_biotypes", "{contrast}_{read_type}_{mapping_type}_{biotypes}_{orientation}_transcript_{fold_type}_{gene_list}_boxplots.pdf") benchmark: OPJ(log_dir, "make_biotype_lfc_boxplots", "{contrast}_{read_type}_{mapping_type}_{biotypes}_{orientation}_transcript_{fold_type}_{gene_list}_boxplots_benchmark.txt") @@ -4004,7 +4006,7 @@ rule compute_size_distribution: input: source_fastq output: - OPJ(output_dir, "read_stats", "{lib}_{rep}_{read_type}_size_distribution.txt") + OPJ("read_stats", "{lib}_{rep}_{read_type}_size_distribution.txt") message: "Computing read size distribution for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}." shell: @@ -4017,7 +4019,7 @@ rule plot_size_distribution: input: rules.compute_size_distribution.output output: - OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_size_distribution.pdf") + OPJ("figures", "{lib}_{rep}", "{read_type}_size_distribution.pdf") message: "Plotting size distribution for trimmed {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}." run: @@ -4218,7 +4220,7 @@ rule plot_base_logo_along: input: data = rules.compute_base_composition_along.output.proportions, output: - figure = OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_base_logo_from_{position}.pdf"), + figure = OPJ("figures", "{lib}_{rep}", "{read_type}_base_logo_from_{position}.pdf"), message: "Plotting base logo from {wildcards.position} for {wildcards.read_type} reads for {wildcards.lib}_{wildcards.rep}." log: @@ -4251,7 +4253,7 @@ rule plot_base_logo: input: data = rules.compute_base_proportions.output.data, output: - figure = OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_{position}_base_logo.pdf"), + figure = OPJ("figures", "{lib}_{rep}", "{read_type}_{position}_base_logo.pdf"), message: "Plotting {wildcards.position} base logo for {wildcards.read_type} reads for {wildcards.lib}_{wildcards.rep}." log: @@ -4283,7 +4285,7 @@ rule plot_base_composition_along: input: data = rules.compute_base_composition_along.output.composition, output: - figure = OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_base_composition_from_{position}.pdf"), + figure = OPJ("figures", "{lib}_{rep}", "{read_type}_base_composition_from_{position}.pdf"), message: "Plotting base composition from {wildcards.position} for {wildcards.read_type} reads for {wildcards.lib}_{wildcards.rep}." log: @@ -4316,7 +4318,7 @@ rule plot_base_composition: input: data = rules.compute_base_composition.output.data, output: - figure = OPJ(output_dir, "figures", "{lib}_{rep}", "{read_type}_{position}_base_composition.pdf"), + figure = OPJ("figures", "{lib}_{rep}", "{read_type}_{position}_base_composition.pdf"), message: "Plotting {wildcards.position} base composition for {wildcards.read_type} reads for {wildcards.lib}_{wildcards.rep}." log: @@ -4357,7 +4359,7 @@ rule plot_summary_barchart: input: rules.make_read_counts_summary.output, output: - barchart = OPJ(output_dir, "figures", "{lib}_{rep}", "%s_smallRNA_barchart.pdf" % size_selected), + barchart = OPJ("figures", "{lib}_{rep}", "%s_smallRNA_barchart.pdf" % size_selected), message: "Plotting small RNA barchart for {wildcards.lib}_{wildcards.rep}_%s." % size_selected log: @@ -4512,8 +4514,8 @@ rule small_RNA_PCA: #counts_table = rules.gather_small_RNA_counts.output.counts_table, standardized_table = rules.standardize_counts.output.standardized_table, output: - pca = OPJ(output_dir, "figures", "{small_type}_{standard}_PCA.pdf"), - components_distrib = OPJ(output_dir, "figures", "{small_type}_{standard}_PC1_PC2_distrib.pdf"), + pca = OPJ("figures", "{small_type}_{standard}_PCA.pdf"), + components_distrib = OPJ("figures", "{small_type}_{standard}_PC1_PC2_distrib.pdf"), threads: 8 # to limit memory usage, actually run: #counts_data = pd.read_table( @@ -4585,11 +4587,11 @@ rule explore_counts: down_genes_list = OPJ(mapping_dir, "deseq2_%s" % size_selected, "{contrast}", "{small_type}_down_genes.txt"), deseq_results = OPJ(mapping_dir, "deseq2_%s" % size_selected, "{contrast}", "{small_type}_deseq2.txt"), output: - #pca = OPJ(output_dir, "figures", "{small_type}_{standard}_PCA.pdf"), - #components_distrib = OPJ(output_dir, "figures", "{small_type}_{standard}_PC1_PC2_distrib.pdf"), - #clustermap = OPJ(output_dir, "figures", "{contrast}_{small_type}_clustermap.pdf"), - zscore_clustermap = OPJ(output_dir, "figures", "{contrast}", "{small_type}_zscore_clustermap.pdf"), - #unit_clustermap = OPJ(output_dir, "figures", "{contrast}_{small_type}_unit_clustermap.pdf"), + #pca = OPJ("figures", "{small_type}_{standard}_PCA.pdf"), + #components_distrib = OPJ("figures", "{small_type}_{standard}_PC1_PC2_distrib.pdf"), + #clustermap = OPJ("figures", "{contrast}_{small_type}_clustermap.pdf"), + zscore_clustermap = OPJ("figures", "{contrast}", "{small_type}_zscore_clustermap.pdf"), + #unit_clustermap = OPJ("figures", "{contrast}_{small_type}_unit_clustermap.pdf"), threads: 8 # to limit memory usage, actually run: counts_data = pd.read_table(