diff --git a/Ribo-seq/Ribo-seq.snakefile b/Ribo-seq/Ribo-seq.snakefile index 47cc90d201f312fe584d13bddf88de6f941a8d55..8591dc509a53c262b6bc811b3e14766ffdd83ebd 100644 --- a/Ribo-seq/Ribo-seq.snakefile +++ b/Ribo-seq/Ribo-seq.snakefile @@ -292,7 +292,7 @@ COND_COLUMNS = pd.DataFrame(CONDITIONS).assign( #SIZE_FACTORS = ["raw", "deduped", size_selected, "mapped", "siRNA", "miRNA"] #SIZE_FACTORS = [size_selected, "mapped", "miRNA"] #TESTED_SIZE_FACTORS = ["mapped", "non_structural", "siRNA", "miRNA", "median_ratio_to_pseudo_ref"] -TESTED_SIZE_FACTORS = ["mapped", "non_structural", "siRNA", "miRNA"] +TESTED_SIZE_FACTORS = ["mapped", "non_structural", "siRNA", "miRNA", "RPF"] #SIZE_FACTORS = ["mapped", "miRNA", "median_ratio_to_pseudo_ref"] # "median_ratio_to_pseudo_ref" is a size factor adapted from # the method described in the DESeq paper, but with addition @@ -301,7 +301,7 @@ TESTED_SIZE_FACTORS = ["mapped", "non_structural", "siRNA", "miRNA"] #DE_SIZE_FACTORS = ["non_structural", "median_ratio_to_pseudo_ref"] DE_SIZE_FACTORS = ["non_structural"] #SIZE_FACTORS = ["non_structural", "median_ratio_to_pseudo_ref"] -SIZE_FACTORS = ["non_structural"] +SIZE_FACTORS = ["non_structural", "RPF"] NORMALIZER = "median_ratio_to_pseudo_ref" # For metagene analyses @@ -385,20 +385,20 @@ bigwig_files = [ # individual libraries expand( expand( - OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", - "{lib}_{rep}_{{read_type}}_on_C_elegans_by_{{norm}}_{{orientation}}.bw"), + OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", + "{lib}_{rep}_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), filtered_product, lib=LIBS, rep=REPS), read_type=READ_TYPES_FOR_MAPPING, norm=SIZE_FACTORS, orientation=["all"]), #expand( # expand( - # OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "{lib}_{rep}_{{read_type}}_on_C_elegans_{{orientation}}.bw"), + # OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{lib}_{rep}_{{read_type}}_on_%s_{{orientation}}.bw" % genome), # filtered_product, lib=LIBS, rep=REPS), # read_type=READ_TYPES_FOR_MAPPING, orientation=ORIENTATIONS), # means of replicates expand( expand( - OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_mean", - "{lib}_mean_{{read_type}}_on_C_elegans_by_{{norm}}_{{orientation}}.bw"), + OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", + "{lib}_mean_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), filtered_product, lib=LIBS), read_type=READ_TYPES_FOR_MAPPING, norm=SIZE_FACTORS, orientation=["all"]), ] @@ -489,7 +489,7 @@ exploratory_graphs = [ fold_heatmaps, # Large figures, not very readable expand( - OPJ(output_dir, aligner, "mapped_C_elegans", f"deseq2_{size_selected}", "{contrast}", "{contrast}_{small_type}_pairplots.{fig_format}"), + OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "{contrast}", "{contrast}_{small_type}_pairplots.{fig_format}"), contrast=DE_CONTRASTS, small_type=DE_TYPES, fig_format=FIG_FORMATS), ## TODO: debug PCA #expand( @@ -526,15 +526,18 @@ fold_boxplots = [de_fold_boxplots, ip_fold_boxplots_by_contrast, ip_fold_boxplot rule all: input: expand( - OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "{read_type}_on_C_elegans_nb_mapped.txt"), + OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_nb_mapped.txt" % genome), filtered_product, lib=LIBS, rep=REPS, read_type=[size_selected]), expand( - OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "{read_type}_on_C_elegans_coverage.txt"), + OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_coverage.txt" % genome), filtered_product, lib=LIBS, rep=REPS, read_type=[size_selected]), + expand( + OPJ(output_dir, "figures", "{lib}_{rep}", "nb_reads.{fig_format}"), + filtered_product, lib=LIBS, rep=REPS, fig_format=FIG_FORMATS), expand( expand( - OPJ(output_dir, aligner, "mapped_C_elegans", "feature_count", "summaries", - "{{lib}}_{{rep}}_%s_on_C_elegans_{orientation}_counts.txt" % size_selected), + OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", + "{{lib}}_{{rep}}_%s_on_%s_{orientation}_counts.txt" % (size_selected, genome)), orientation=ORIENTATIONS), filtered_product, lib=LIBS, rep=REPS), bigwig_files, @@ -542,17 +545,17 @@ rule all: # rule future_all: # meta_profiles, # read_graphs, -# #expand(OPJ(output_dir, aligner, "mapped_C_elegans", "feature_count", "summaries", "{lib}_{rep}_nb_non_structural.txt"), filtered_product, lib=LIBS, rep=REPS), -# OPJ(output_dir, aligner, "mapped_C_elegans", f"RPM_folds_{size_selected}", "all", "pisimi_mean_log2_RPM_fold.txt"), -# expand(OPJ(output_dir, aligner, "mapped_C_elegans", f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]), -# #expand(OPJ(output_dir, aligner, "mapped_C_elegans", "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=IP_CONTRASTS, small_type=DE_TYPES), +# #expand(OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", "{lib}_{rep}_nb_non_structural.txt"), filtered_product, lib=LIBS, rep=REPS), +# OPJ(output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "pisimi_mean_log2_RPM_fold.txt"), +# expand(OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]), +# #expand(OPJ(output_dir, aligner, f"mapped_{genome}", "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=IP_CONTRASTS, small_type=DE_TYPES), # exploratory_graphs, # fold_boxplots, # #expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.{fig_format}"), small_type=SMALL_TYPES, fig_format=FIG_FORMATS), # # piRNA and satel_siu raise ValueError: `dataset` input should have multiple elements when plotting # # simrep_siu raise TypeError: Empty 'DataFrame': no numeric data to plot # expand( -# OPJ(counts_dir, f"all_{size_selected}_on_C_elegans", "{small_type}_RPM.txt"), +# OPJ(counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"), # small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu", "pisimi"]), # expand( # OPJ(output_dir, "figures", "{small_type}_norm_correlations.{fig_format}"), @@ -599,6 +602,38 @@ rule trim_and_dedup: """ +rule trim: + input: + rules.link_raw_data.output, + #OPJ(data_dir, "{lib}_{rep}.fastq.gz"), + params: + adapter = lambda wildcards : lib2adapt[wildcards.lib], + trim5 = trim5, + trim3 = trim3, + output: + trimmed = OPJ(data_dir, "trimmed", "{lib}_{rep}_trimmed.fastq.gz"), + nb_raw = OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_raw.txt"), + nb_trimmed = OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_trimmed.txt"), + #threads: 2 + message: + "Trimming adaptor from raw data, deduplicating reads, removing random 5' {trim5}-mers and 3' {trim3}-mers for {wildcards.lib}_{wildcards.rep}." + benchmark: + OPJ(log_dir, "trim", "{lib}_{rep}_benchmark.txt") + log: + cutadapt = OPJ(log_dir, "cutadapt", "{lib}_{rep}.log"), + trim = OPJ(log_dir, "trim", "{lib}_{rep}.log"), + shell: + """ + zcat {input} \\ + | tee >(count_fastq_reads {output.nb_raw}) \\ + | cutadapt -a {params.adapter} --discard-untrimmed - 2> {log.cutadapt} \\ + | tee >(count_fastq_reads {output.nb_trimmed}) \\ + | trim_random_nt {params.trim5} {params.trim3} 2>> {log.cutadapt} \\ + | gzip > {output.trimmed} \\ + 2> {log.trim_and_dedup} + """ + + def awk_size_filter(wildcards): """Returns the bioawk filter to select reads of size from MIN_LEN to MAX_LEN.""" return "%s <= length($seq) && length($seq) <= %s" % (MIN_LEN, MAX_LEN) @@ -645,8 +680,8 @@ rule map_on_genome: # fastq = rules.select_size_range.output.selected, fastq = source_fastq, output: - sam = temp(OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "{read_type}_on_C_elegans.sam")), - nomap_fastq = OPJ(output_dir, aligner, "not_mapped_C_elegans", "{lib}_{rep}_{read_type}_unmapped_on_C_elegans.fastq.gz"), + sam = temp(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s.sam" % genome)), + nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome), params: aligner = aligner, index = index, @@ -667,22 +702,22 @@ rule map_on_genome: def source_sam(wildcards): if hasattr(wildcards, "read_type"): return OPJ( - output_dir, aligner, "mapped_C_elegans", + output_dir, aligner, f"mapped_{genome}", f"{wildcards.lib}_{wildcards.rep}", - f"{wildcards.read_type}_on_C_elegans.sam") + f"{wildcards.read_type}_on_{genome}.sam") else: return OPJ( - output_dir, aligner, "mapped_C_elegans", + output_dir, aligner, f"mapped_{genome}", f"{wildcards.lib}_{wildcards.rep}", - f"{size_selected}_on_C_elegans.sam") + f"{size_selected}_on_{genome}.sam") rule sam2indexedbam: input: sam = source_sam, output: - sorted_bam = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "{read_type}_on_C_elegans_sorted.bam"), - index = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "{read_type}_on_C_elegans_sorted.bam.bai"), + sorted_bam = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_sorted.bam" % genome), + index = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_sorted.bam.bai" % genome), message: "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type}." benchmark: @@ -700,7 +735,7 @@ rule compute_mapping_stats: input: sorted_bam = rules.sam2indexedbam.output.sorted_bam, output: - stats = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "{read_type}_on_C_elegans_samtools_stats.txt"), + stats = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_samtools_stats.txt" % genome), shell: """samtools stats {input.sorted_bam} > {output.stats}""" @@ -710,7 +745,7 @@ rule get_nb_mapped: input: stats = rules.compute_mapping_stats.output.stats, output: - nb_mapped = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "{read_type}_on_C_elegans_nb_mapped.txt"), + nb_mapped = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_nb_mapped.txt" % genome), shell: """samstats2mapped {input.stats} {output.nb_mapped}""" @@ -719,7 +754,7 @@ rule compute_coverage: input: sorted_bam = rules.sam2indexedbam.output.sorted_bam, output: - coverage = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "{read_type}_on_C_elegans_coverage.txt"), + coverage = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_coverage.txt" % genome), params: genomelen = genomelen, shell: @@ -732,13 +767,13 @@ rule compute_coverage: rule extract_RPF: input: sorted_bam = OPJ( - output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", - "%s_on_C_elegans_sorted.bam" % size_selected), + output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", + f"{size_selected}_on_{genome}_sorted.bam"), protein_gtf = OPJ(annot_dir, "protein_coding.gtf") output: - rpf = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "%s_on_C_elegans_fwd_on_protein_coding.fastq.gz" % size_selected) + rpf = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_fwd_on_protein_coding.fastq.gz") params: - tmp_filtered_bam = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "%s_on_C_elegans_fwd_on_protein_coding.bam" % size_selected), + tmp_filtered_bam = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_fwd_on_protein_coding.bam"), shell: """ mkfifo {params.tmp_filtered_bam} @@ -762,10 +797,10 @@ def feature_orientation2stranded(wildcards): def source_bams(wildcards): """We can count from several bam files with feature_count.""" - read_types = wildcards.read_types.split("_and_") + read_types = wildcards.read_type.split("_and_") sorted_bams = [OPJ( - output_dir, aligner, "mapped_C_elegans", f"{wildcards.lib}_{wildcards.rep}", - f"{read_type}_on_C_elegans_sorted.bam") for read_type in read_types] + output_dir, aligner, f"mapped_{genome}", f"{wildcards.lib}_{wildcards.rep}", + f"{read_type}_on_{genome}_sorted.bam") for read_type in read_types] return sorted_bams @@ -774,22 +809,22 @@ rule feature_count_reads: source_bams, output: counts = OPJ( - output_dir, aligner, "mapped_C_elegans", "feature_count", - "{lib}_{rep}_{read_types}_on_C_elegans_{biotype}_{orientation}_counts.txt"), + output_dir, aligner, f"mapped_{genome}", "feature_count", + "{lib}_{rep}_{read_type}_on_%s_{biotype}_{orientation}_counts.txt" % genome), params: stranded = feature_orientation2stranded, + annot = lambda wildcards: OPJ(annot_dir, f"{wildcards.biotype}.gtf") message: - "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep}_{wildcards.read_types} with featureCounts." + "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep}_{wildcards.read_type} with featureCounts." benchmark: - OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_types}_{biotype}_{orientation}_benchmark.txt"), + OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_type}_{biotype}_{orientation}_benchmark.txt"), log: - log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_types}_{biotype}_{orientation}.log"), - err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_types}_{biotype}_{orientation}.err"), + log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_type}_{biotype}_{orientation}.log"), + err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{read_type}_{biotype}_{orientation}.err"), shell: """ - tmpdir=$(TMPDIR=/var/tmp mktemp --tmpdir -d "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.read_types}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX") - annot="/pasteur/entites/Mhe/Genomes/C_elegans/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/{wildcards.biotype}.gtf" - cmd="featureCounts -a ${{annot}} -o {output.counts} -t transcript -g "gene_id" -O -s {params.stranded} --fracOverlap 1 --tmpDir ${{tmpdir}} {input}" + tmpdir=$(TMPDIR=/var/tmp mktemp --tmpdir -d "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.read_type}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX") + cmd="featureCounts -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -s {params.stranded} --fracOverlap 1 --tmpDir ${{tmpdir}} {input}" featureCounts -v 2> {log.log} echo ${{cmd}} 1>> {log.log} eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed" @@ -801,17 +836,17 @@ rule summarize_feature_counts: """For a given library, compute the total counts for each biotype and write this in a summary table.""" input: expand( - OPJ(output_dir, aligner, "mapped_C_elegans", "feature_count", - "{{lib}}_{{rep}}_{{read_types}}_on_C_elegans_{biotype}_{{orientation}}_counts.txt"), + OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", + "{{lib}}_{{rep}}_{{read_type}}_on_%s_{biotype}_{{orientation}}_counts.txt" % genome), biotype=COUNT_BIOTYPES), output: - summary = OPJ(output_dir, aligner, "mapped_C_elegans", "feature_count", "summaries", - "{lib}_{rep}_{read_types}_on_C_elegans_{orientation}_counts.txt") + summary = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", + "{lib}_{rep}_{read_type}_on_%s_{orientation}_counts.txt" % genome) run: with open(output.summary, "w") as summary_file: header = "\t".join(COUNT_BIOTYPES) summary_file.write("#biotypes\t%s\n" % header) - sums = "\t".join((str(sum_feature_counts(counts_file, nb_bams=len(wildcards.read_types.split("_and_")))) for counts_file in input)) + sums = "\t".join((str(sum_feature_counts(counts_file, nb_bams=len(wildcards.read_type.split("_and_")))) for counts_file in input)) summary_file.write(f"%s_%s_%s\t%s\n" % (wildcards.lib, wildcards.rep, wildcards.orientation, sums)) @@ -854,14 +889,14 @@ rule count_non_structural_mappers: input: # nb_mapped is the total number of size-selected that mapped #nb_mapped = rules.get_nb_mapped.output.nb_mapped, - nb_mapped = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "%s_on_C_elegans_nb_mapped.txt" % size_selected), + nb_mapped = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_nb_mapped.txt"), # structural are fwd to STRUCTURAL_BIOTYPES summary = OPJ( - output_dir, aligner, "mapped_C_elegans", "feature_count", "summaries", - "{lib}_{rep}_%s_on_C_elegans_fwd_counts.txt" % size_selected), + output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", + "{lib}_{rep}_%s_on_%s_fwd_counts.txt" % (size_selected, genome)), output: nb_non_structural = OPJ( - output_dir, aligner, "mapped_C_elegans", "feature_count", "summaries", + output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", "{lib}_{rep}_nb_non_structural.txt"), run: nb_mapped = read_int_from_file(input.nb_mapped) @@ -870,34 +905,6 @@ rule count_non_structural_mappers: out_file.write("%d\n" % (nb_mapped - structural)) -rule plot_read_numbers: - input: - nb_raw = rules.trim_and_dedup.output.nb_raw, - nb_deduped = rules.trim_and_dedup.output.nb_deduped, - nb_trimmed = rules.trim_and_dedup.output.nb_trimmed, - nb_selected = rules.select_size_range.output.nb_selected, - nb_mapped = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "%s_on_C_elegans_nb_mapped.txt" % size_selected), - output: - plot = OPJ(output_dir, "figures", "{lib}_{rep}", "nb_reads.{fig_format}"), - message: - "Plotting read numbers for {wildcards.lib}_{wildcards.rep}." - # Currently not used - #log: - # log = OPJ(log_dir, "plot_read_numbers", "{lib}_{rep}.log"), - # err = OPJ(log_dir, "plot_read_numbers", "{lib}_{rep}.err") - run: - name = f"{wildcards.lib}_{wildcards.rep}" - summary = pd.Series({ - "nb_raw" : read_int_from_file(input.nb_raw), - "nb_deduped" : read_int_from_file(input.nb_deduped), - "nb_trimmed" : read_int_from_file(input.nb_trimmed), - "nb_selected" : read_int_from_file(input.nb_selected), - "nb_mapped" : read_int_from_file(input.nb_mapped)}) - # Chose column order: - summary = summary[["nb_raw", "nb_deduped", "nb_trimmed", "nb_selected", "nb_mapped"]] - save_plot(output.plot, plot_barchart, summary, title="%s reads" % name) - - # import re # from statistics import mean # def log2data(line): @@ -915,12 +922,12 @@ rule plot_read_numbers: rule count_RPF: input: - summary_table = OPJ(output_dir, aligner, "mapped_C_elegans", "feature_count", "summaries", - "{lib}_{rep}_%s_on_C_elegans_fwd_counts.txt" % size_selected) + summary_table = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", + "{lib}_{rep}_%s_on_%s_fwd_counts.txt" % (size_selected, genome)) output: # TODO: - nb_RPF = OPJ(output_dir, aligner, "mapped_C_elegans", "feature_count", "summaries", - "{lib}_{rep}_%s_on_C_elegans_fwd_nb_RPF.txt" % size_selected) + nb_RPF = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", + "{lib}_{rep}_%s_on_%s_fwd_nb_RPF.txt" % (size_selected, genome)) run: nb_RPF = pd.read_table(input.summary_table, index_col=0)["protein_coding"][f"{wildcards.lib}_{wildcards.rep}_fwd"] with open(output.nb_RPF, "w") as nb_RPF_file: @@ -935,6 +942,37 @@ def source_counts(wildcards): raise NotImplementedError +rule plot_read_numbers: + input: + nb_raw = rules.trim_and_dedup.output.nb_raw, + #nb_deduped = rules.trim_and_dedup.output.nb_deduped, + nb_trimmed = rules.trim_and_dedup.output.nb_trimmed, + nb_selected = rules.select_size_range.output.nb_selected, + nb_mapped = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_nb_mapped.txt"), + nb_RPF = rules.count_RPF.output.nb_RPF, + output: + plot = OPJ(output_dir, "figures", "{lib}_{rep}", "nb_reads.{fig_format}"), + message: + "Plotting read numbers for {wildcards.lib}_{wildcards.rep}." + # Currently not used + #log: + # log = OPJ(log_dir, "plot_read_numbers", "{lib}_{rep}.log"), + # err = OPJ(log_dir, "plot_read_numbers", "{lib}_{rep}.err") + run: + name = f"{wildcards.lib}_{wildcards.rep}" + summary = pd.Series({ + "nb_raw" : read_int_from_file(input.nb_raw), + # "nb_deduped" : read_int_from_file(input.nb_deduped), + "nb_trimmed" : read_int_from_file(input.nb_trimmed), + "nb_selected" : read_int_from_file(input.nb_selected), + "nb_mapped" : read_int_from_file(input.nb_mapped), + "nb_RPF" : read_int_from_file(input.nb_RPF)}) + # Chose column order: + #summary = summary[["nb_raw", "nb_deduped", "nb_trimmed", "nb_selected", "nb_mapped", "nb_RPF"]] + summary = summary[["nb_raw", "nb_trimmed", "nb_selected", "nb_mapped", "nb_RPF"]] + save_plot(output.plot, plot_barchart, summary, title="%s reads" % name) + + # Generate report with # - raw # - trimmed @@ -947,7 +985,7 @@ def source_counts(wildcards): rule make_read_counts_summary: input: nb_raw = rules.trim_and_dedup.output.nb_raw, - nb_deduped = rules.trim_and_dedup.output.nb_deduped, + # nb_deduped = rules.trim_and_dedup.output.nb_deduped, nb_trimmed = rules.trim_and_dedup.output.nb_trimmed, nb_size_selected = rules.select_size_range.output.nb_selected, nb_mapped = rules.get_nb_mapped.output.nb_mapped, @@ -956,7 +994,7 @@ rule make_read_counts_summary: output: summary = OPJ( output_dir, aligner, "summaries", - "{lib}_{rep}_{read_type}_on_C_elegans_read_counts.txt"), + "{lib}_{rep}_{read_type}_on_%s_read_counts.txt" % genome), #threads: 8 # to limit memory usage, actually run: with open(output.summary, "w") as summary_file: @@ -967,8 +1005,8 @@ rule make_read_counts_summary: summary_file.write("\t") summary_file.write("%d" % read_int_from_file(input.nb_trimmed)) summary_file.write("\t") - summary_file.write("%d" % read_int_from_file(input.nb_deduped)) - summary_file.write("\t") + # summary_file.write("%d" % read_int_from_file(input.nb_deduped)) + # summary_file.write("\t") summary_file.write("%d" % read_int_from_file(input.nb_size_selected)) summary_file.write("\t") summary_file.write("%d" % read_int_from_file(input.nb_mapped)) @@ -984,7 +1022,7 @@ rule compute_median_ratio_to_pseudo_ref_size_factors: counts_table = source_counts, output: median_ratios_file = OPJ( - counts_dir, "all_%s_on_C_elegans" % size_selected, + counts_dir, f"all_{size_selected}_on_{genome}", "{read_type}_median_ratios_to_pseudo_ref.txt"), run: counts_data = pd.read_table( @@ -1004,15 +1042,15 @@ rule gather_read_counts_summaries: input: summary_tables = expand(OPJ( output_dir, aligner, "summaries", - "{name}_%s_on_C_elegans_read_counts.txt" % size_selected), name=COND_NAMES), + "{name}_%s_on_%s_read_counts.txt" % (size_selected, genome)), name=COND_NAMES), output: summary_table = OPJ( output_dir, aligner, "summaries", - "all_%s_on_C_elegans_read_counts.txt" % size_selected), + f"all_{size_selected}_on_{genome}_read_counts.txt"), run: summary_files = (OPJ( output_dir, aligner, "summaries", - f"{cond_name}_{size_selected}_on_C_elegans_read_counts.txt") for cond_name in COND_NAMES) + f"{cond_name}_{size_selected}_on_{genome}_read_counts.txt") for cond_name in COND_NAMES) summaries = pd.concat((pd.read_table(summary_file).T.astype(int) for summary_file in summary_files), axis=1) summaries.columns = COND_NAMES summaries.to_csv(output.summary_table, sep="\t") @@ -1046,7 +1084,7 @@ rule compute_RPM: output: RPM_table = OPJ( counts_dir, - f"all_{size_selected}_on_C_elegans", "{small_type}_RPM.txt"), + f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"), log: log = OPJ(log_dir, "compute_RPM", "{small_type}.log"), run: @@ -1070,7 +1108,7 @@ rule compute_RPM_folds: summary_table = rules.gather_read_counts_summaries.output.summary_table, output: fold_results = OPJ( - output_dir, aligner, "mapped_C_elegans", f"RPM_folds_{size_selected}", + output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), log: log = OPJ(log_dir, "compute_RPM_folds", "{contrast}_{small_type}.log"), @@ -1120,16 +1158,16 @@ rule gather_RPM_folds: """Gathers RPM folds across contrasts.""" input: fold_results = expand(OPJ( - output_dir, aligner, "mapped_C_elegans", f"RPM_folds_{size_selected}", + output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "{contrast}", "{contrast}_{{small_type}}_RPM_folds.txt"), contrast=IP_CONTRASTS), output: all_folds = OPJ( - output_dir, aligner, "mapped_C_elegans", f"RPM_folds_{size_selected}", "all", "{small_type}_mean_log2_RPM_fold.txt"), + output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", "all", "{small_type}_mean_log2_RPM_fold.txt"), log: log = OPJ(log_dir, "gather_RPM_folds", "{small_type}.log"), run: pd.DataFrame({contrast : pd.read_table( - OPJ(output_dir, aligner, "mapped_C_elegans", f"RPM_folds_{size_selected}", + OPJ(output_dir, aligner, f"mapped_{genome}", f"RPM_folds_{size_selected}", f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"), index_col=["gene", "cosmid", "name", "small_type"], usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv( @@ -1349,7 +1387,7 @@ rule standardize_counts: #counts_table = rules.gather_small_RNA_counts.output.counts_table, counts_table = source_counts, output: - standardized_table = OPJ(counts_dir, "all_%s_on_C_elegans" % size_selected, "{small_type}_{standard}.txt"), + standardized_table = OPJ(counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_{standard}.txt"), run: counts_data = pd.read_table( input.counts_table, @@ -1396,53 +1434,6 @@ def bamcoverage_filter(wildcards): return "" -# Note: It seems that running several bamCoverage at the same time (using snakemake -j <n>) -# can result in strange failures, like the following: -# Traceback (most recent call last): -# File "/home/bli/.local/bin/bamCoverage", line 11, in <module> -# main(args) -# File "/home/bli/.local/lib/python3.6/site-packages/deeptools/bamCoverage.py", line 255, in main -# format=args.outFileFormat, smoothLength=args.smoothLength) -# File "/home/bli/.local/lib/python3.6/site-packages/deeptools/writeBedGraph.py", line 164, in run -# chrom_names_and_size, bedgraph_file, out_file_name, True) -# File "/home/bli/.local/lib/python3.6/site-packages/deeptools/writeBedGraph.py", line 296, in bedGraphToBigWig -# f = open("{}.sorted".format(_file.name)) -# FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmpsgluqboy.sorted -rule make_bigwig: - input: - bam = rules.sam2indexedbam.output.sorted_bam, - output: - bigwig = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "{lib}_{rep}_{read_type}_on_C_elegans_{orientation}.bw"), - params: - orient_filter = bamcoverage_filter, - threads: 8 # to limit memory usage, actually - log: - log = OPJ(log_dir, "make_bigwig", "{lib}_{rep}_{read_type}_{orientation}.log"), - err = OPJ(log_dir, "make_bigwig", "{lib}_{rep}_{read_type}_{orientation}.err"), - run: - # TODO: make this a function of deeptools version - no_reads = """Error: The generated bedGraphFile was empty. Please adjust -your deepTools settings and check your input files. -""" -# no_reads = """[bwClose] There was an error while finishing writing a bigWig file! The output is likely truncated. -#""" - try: - shell(""" - cmd="bamCoverage -b {input.bam} --skipNAs {params.orient_filter} \\ - -of=bigwig -bs 10 -p={threads} \\ - -o {output.bigwig} \\ - 1>> {log.log} 2>> {log.err}" - echo ${{cmd}} >> {log.log} - eval ${{cmd}} || error_exit "bamCoverage failed" - """) - except CalledProcessError as e: - if last_lines(log.err, 2) == no_reads: - with open(output.bigwig, "w") as bwfile: - bwfile.write("") - else: - raise - - def source_norm_file(wildcards): return rules.make_read_counts_summary.output.summary @@ -1452,9 +1443,9 @@ rule make_normalized_bigwig: input: bam = rules.sam2indexedbam.output.sorted_bam, norm_file = source_norm_file, - #median_ratios_file = OPJ(output_dir, aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "all_si_median_ratios_to_pseudo_ref.txt"), + #median_ratios_file = OPJ(output_dir, aligner, f"mapped_{genome}", "annotation", f"all_{size_selected}_on_{genome}", "all_si_median_ratios_to_pseudo_ref.txt"), output: - bigwig_norm = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "{lib}_{rep}_{read_type}_on_C_elegans_by_{norm}_{orientation}.bw"), + bigwig_norm = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), params: #orient_filter = bamcoverage_filter, genome_binned = genome_binned, @@ -1529,11 +1520,11 @@ bam2bigwig.sh: bedGraphToBigWig failed rule merge_bigwig_reps: """This rule merges bigwig files by computing a mean across replicates.""" input: - expand(OPJ(output_dir, aligner, "mapped_C_elegans", "{{lib}}_{rep}", "{{lib}}_{rep}_{{read_type}}_on_C_elegans_by_{{norm}}_{{orientation}}.bw"), rep=REPS), + expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{{lib}}_{rep}", "{{lib}}_{rep}_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), rep=REPS), output: - bw = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_mean", "{lib}_mean_{read_type}_on_C_elegans_by_{norm}_{orientation}.bw"), + bw = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), log: - warnings = OPJ(log_dir, "merge_bigwig_reps", "{lib}_mean_{read_type}_on_C_elegans_by_{norm}_{orientation}.warnings"), + warnings = OPJ(log_dir, "merge_bigwig_reps", "{lib}_mean_{read_type}_on_%s_by_{norm}_{orientation}.warnings" % genome), threads: 2 # to limit memory usage, actually run: with warn_context(log.warnings) as warn: @@ -1575,12 +1566,12 @@ rule merge_bigwig_reps: rule make_bigwig_ratios: """This rule tries to make a bigwig file displaying the ratio between a condition and a reference, from the corresponding two normalized bigwig files.""" input: - cond_bw = OPJ(output_dir, aligner, "mapped_C_elegans", "{cond}_{{rep}}", "{cond}_{{rep}}_{{read_type}}_on_C_elegans_by_{{norm}}_{{orientation}}.bw"), - ref_bw = OPJ(output_dir, aligner, "mapped_C_elegans", "{ref}_{{rep}}", "{ref}_{{rep}}_{{read_type}}_on_C_elegans_by_{{norm}}_{{orientation}}.bw"), + cond_bw = OPJ(output_dir, aligner, f"mapped_{genome}", "{cond}_{{rep}}", "{cond}_{{rep}}_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), + ref_bw = OPJ(output_dir, aligner, f"mapped_{genome}", "{ref}_{{rep}}", "{ref}_{{rep}}_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), output: - bw = OPJ(output_dir, aligner, "mapped_C_elegans", "{cond}_vs_{ref}_ratio", "{cond}_vs_{ref}_ratio_{read_type}_on_C_elegans_by_{norm}_{orientation}.bw"), + bw = OPJ(output_dir, aligner, f"mapped_{genome}", "{cond}_vs_{ref}_ratio", "{cond}_vs_{ref}_ratio_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), log: - warnings = OPJ(log_dir, "make_bigwig_ratios", "{cond}_vs_{ref}_ratio_{read_type}_on_C_elegans_by_{norm}_{orientation}.warnings"), + warnings = OPJ(log_dir, "make_bigwig_ratios", "{cond}_vs_{ref}_ratio_{read_type}_on_%s_by_{norm}_{orientation}.warnings" % genome), threads: 8 # to limit memory usage, actually run: with warn_context(log.warnings) as warn: @@ -1774,6 +1765,7 @@ rule adjust_TSS: out_bed = OPJ(local_annot_dir, "transcripts_{type_set}", "with_TSS.bed"), run: TSS_dict = defaultdict(set) + # TODO: This should be configurable with open("/pasteur/entites/Mhe/Genomes/C_elegans/TSS_annotations/Kruesi_TSS_coding_WT_L3_ce11_sorted.bed", "r") as TSS_bedfile: for chrom, bed_start, _, gene_info in map(strip_split, TSS_bedfile): # chrI 35384 35385 WBGene00022279|sesn-1@chrI:27595-32482|-1 @@ -1897,6 +1889,7 @@ rule select_genes_for_meta_profile: out_bed = OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}_{biotype}_min_{min_len}.bed"), run: TSS_dict = defaultdict(set) + # TODO: This should be configurable with open("/pasteur/entites/Mhe/Genomes/C_elegans/TSS_annotations/Kruesi_TSS_coding_WT_L3_ce11_sorted.bed", "r") as TSS_bedfile: for chrom, bed_start, _, gene_info in map(strip_split, TSS_bedfile): # chrI 35384 35385 WBGene00022279|sesn-1@chrI:27595-32482|-1 @@ -1955,6 +1948,7 @@ rule select_gene_list_for_meta_profile: out_bed = OPJ(local_annot_dir, "transcripts_{type_set}", "merged_isolated_{min_dist}_{id_list}.bed"), run: TSS_dict = defaultdict(set) + # TODO: this should be configurable with open("/pasteur/entites/Mhe/Genomes/C_elegans/TSS_annotations/Kruesi_TSS_coding_WT_L3_ce11_sorted.bed", "r") as TSS_bedfile: for chrom, bed_start, _, gene_info in map(strip_split, TSS_bedfile): # chrI 35384 35385 WBGene00022279|sesn-1@chrI:27595-32482|-1 @@ -2107,7 +2101,7 @@ def meta_params(wildcards): # rule plot_meta_profile_mean: # input: # bigwig = rules.merge_bigwig_reps.output.bw, -# #bw = OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_mean", "{read_type}_on_C_elegans_norm_{orientation}.bw"), +# #bw = OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", "{read_type}_on_%s_norm_{orientation}.bw" % genome), # bed = rules.select_genes_for_meta_profile.output.out_bed # output: # OPJ(output_dir, "figures", "{lib}_mean", "{read_type}_{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.{fig_format}" % (MIN_DIST, META_MIN_LEN)), @@ -2145,8 +2139,8 @@ def get_libs_for_series(wildcards): def source_bigwigs(wildcards): return expand( - OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_mean", - "{lib}_mean_{read_type}_on_C_elegans_by_{norm}_{orientation}.bw"), + OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", + "{lib}_mean_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), lib=get_libs_for_series(wildcards), read_type=[wildcards.read_type], norm=wildcards.norm, orientation=wildcards.orientation) @@ -2161,8 +2155,8 @@ def make_y_label(wildcards): rule plot_biotype_mean_meta_profile: input: #bigwig = rules.merge_bigwig_reps.output.bw, - #bws = expand(OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_mean", "{lib}_mean_{{read_type}}_on_C_elegans_by_{{norm}}_{{orientation}}.bw"), lib=LIBS), - #bws = expand(OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_mean", "{lib}_mean_{{read_type}}_on_C_elegans_by_{{norm}}_{{orientation}}.bw"), lib=series_dict[wildcards.series_type][wildcards.series]), + #bws = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), lib=LIBS), + #bws = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), lib=series_dict[wildcards.series_type][wildcards.series]), bws = source_bigwigs, bed = rules.select_genes_for_meta_profile.output.out_bed output: @@ -2208,8 +2202,8 @@ rule plot_biotype_mean_meta_profile: rule plot_gene_list_mean_meta_profile: input: #bigwig = rules.merge_bigwig_reps.output.bw, - #bws = expand(OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_mean", "{lib}_mean_{{read_type}}_on_C_elegans_by_{{norm}}_{{orientation}}.bw"), lib=LIBS), - #bws = expand(OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_mean", "{lib}_mean_{{read_type}}_on_C_elegans_by_{{norm}}_{{orientation}}.bw"), lib=series_dict[wildcards.series_type][wildcards.series]), + #bws = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), lib=LIBS), + #bws = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_mean", "{lib}_mean_{{read_type}}_on_%s_by_{{norm}}_{{orientation}}.bw" % genome), lib=series_dict[wildcards.series_type][wildcards.series]), bws = source_bigwigs, bed = rules.select_gene_list_for_meta_profile.output.out_bed output: @@ -2281,10 +2275,10 @@ rule small_RNA_differential_expression: counts_table = source_counts, summary_table = rules.gather_read_counts_summaries.output.summary_table, output: - deseq_results = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_deseq2.txt"), - up_genes = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_up_genes.txt"), - down_genes = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_down_genes.txt"), - counts_and_res = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_counts_and_res.txt"), + deseq_results = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_deseq2.txt"), + up_genes = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_up_genes.txt"), + down_genes = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_down_genes.txt"), + counts_and_res = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_counts_and_res.txt"), log: warnings = OPJ(log_dir, "small_RNA_differential_expression", "{contrast}_{small_type}.warnings"), threads: 4 # to limit memory usage, actually @@ -2368,16 +2362,16 @@ rule gather_DE_folds: """Gathers DE folds across contrasts.""" input: fold_results = expand(OPJ( - output_dir, aligner, "mapped_C_elegans", f"deseq2_{size_selected}", + output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "{contrast}", "{contrast}_{{small_type}}_counts_and_res.txt"), contrast=DE_CONTRASTS), output: all_folds = OPJ( - output_dir, aligner, "mapped_C_elegans", f"deseq2_{size_selected}", "all", "{small_type}_{fold_type}.txt"), + output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", "all", "{small_type}_{fold_type}.txt"), log: log = OPJ(log_dir, "gather_DE_folds", "{small_type}.log"), run: pd.DataFrame({contrast : pd.read_table( - OPJ(output_dir, aligner, "mapped_C_elegans", f"deseq2_{size_selected}", + OPJ(output_dir, aligner, f"mapped_{genome}", f"deseq2_{size_selected}", f"{contrast}", f"{contrast}_{wildcards.small_type}_counts_and_res.txt"), index_col=["gene", "cosmid", "name", "small_type"], usecols=["gene", "cosmid", "name", "small_type", wildcards.fold_type])[wildcards.fold_type] for contrast in DE_CONTRASTS}).to_csv( @@ -2391,8 +2385,8 @@ rule plot_scatterplots: input: counts_and_res = rules.small_RNA_differential_expression.output.counts_and_res, output: - #pairplot = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_pairplot.pdf"), - pairplots = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_pairplots.{fig_format}"), + #pairplot = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_pairplot.pdf"), + pairplots = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_pairplots.{fig_format}"), run: counts_and_res = pd.read_table(input.counts_and_res, index_col="gene") (cond, ref) = CONTRAST2PAIR[wildcards.contrast] @@ -2430,29 +2424,6 @@ rule plot_scatterplots: #sns.pairplot(counts_and_res, vars=COND_NAMES, hue="status") -#rule convert_wormids: -# input: -# #counts_table_in = OPJ(output_dir, aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "{small_type}_counts.txt"), -# deseq_results_in = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_deseq2.txt"), -# counts_and_res_in = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_counts_and_res.txt"), -# up_genes_in = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_up_genes.txt"), -# down_genes_in = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_down_genes.txt"), -# output: -# #counts_table_out = OPJ(output_dir, aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "{small_type}_counts_{id_type}.txt"), -# deseq_results_out = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_deseq2_{id_type}.txt"), -# counts_and_res_out = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_counts_and_res_{id_type}.txt"), -# up_genes_out = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_up_genes_{id_type}.txt"), -# down_genes_out = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_down_genes_{id_type}.txt"), -# run: -# with open(OPJ(convert_dir, "wormid2{id_type}.pickle".format(id_type=wildcards.id_type)), "rb") as dict_file: -# converter = load(dict_file) -# for infile_name, outfile_name in zip(input, output): -# with open(infile_name, "r") as infile, open(outfile_name, "w") as outfile: -# for fields in map(strip_split, infile): -# outfile.write("%s\n" % "\t".join((converter.get(field, field) for field in fields))) - - - # takes wildcards, gene list name or path # returns list of wormbase ids get_id_list = make_id_list_getter(gene_lists_dir, avail_id_lists) @@ -2469,7 +2440,7 @@ def source_fold_data(wildcards): if fold_type in {"log2FoldChange", "lfcMLE"}: if hasattr(wildcards, "contrast_type"): return expand( - OPJ(output_dir, aligner, "mapped_C_elegans", + OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{contrast}_{{small_type}}_counts_and_res.txt"), contrast=contrasts_dict[wildcards.contrast_type]) @@ -2480,7 +2451,7 @@ def source_fold_data(wildcards): #https://stackoverflow.com/a/26791923/1878788 #print("{0.small_type}".format(wildcards)) return [filename.format(wildcards) for filename in expand( - OPJ(output_dir, aligner, "mapped_C_elegans", + OPJ(output_dir, aligner, f"mapped_{genome}", "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{{0.small_type}}_RPM_folds.txt"), contrast=contrasts_dict[wildcards.contrast_type])] @@ -3067,13 +3038,13 @@ rule small_RNA_PCA: rule explore_counts: input: counts_table = source_counts, - standardized_table = OPJ(output_dir, aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "{small_type}_zscore.txt"), - normalized_table = OPJ(output_dir, aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "{small_type}_unit.txt"), + standardized_table = OPJ(output_dir, aligner, f"mapped_{genome}", "annotation", f"all_{size_selected}_on_{genome}", "{small_type}_zscore.txt"), + normalized_table = OPJ(output_dir, aligner, f"mapped_{genome}", "annotation", f"all_{size_selected}_on_{genome}", "{small_type}_unit.txt"), summary_table = rules.gather_read_counts_summaries.output.summary_table, #standardized_table = rules.standardize_counts.output.standardized_table, - up_genes_list = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_up_genes.txt"), - down_genes_list = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_down_genes.txt"), - deseq_results = OPJ(output_dir, aligner, "mapped_C_elegans", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_deseq2.txt"), + up_genes_list = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_up_genes.txt"), + down_genes_list = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_down_genes.txt"), + deseq_results = OPJ(output_dir, aligner, f"mapped_{genome}", "deseq2_%s" % size_selected, "{contrast}", "{small_type}_deseq2.txt"), output: #pca = OPJ(output_dir, "figures", "{small_type}_{standard}_PCA.{fig_format}"), #components_distrib = OPJ(output_dir, "figures", "{small_type}_{standard}_PC1_PC2_distrib.{fig_format}"), @@ -3092,9 +3063,7 @@ rule explore_counts: # output_dir, # aligner, # "summaries", - # "{name}_{read_type}_on_C_elegans_read_counts.txt".format( - # name=cond_name, - # read_type=size_selected)) for cond_name in COND_NAMES) + # f"{cond_name}_{size_selected}_on_{genome}_read_counts.txt") for cond_name in COND_NAMES) ## summaries = dict(zip(COND_NAMES, (pd.read_table(summary_file).squeeze().astype(int) for summary_file in summary_files))) ## summaries = pd.concat((summaries[cond_name] for cond_name in COND_NAMES), axis=1) #summaries = pd.concat((pd.read_table(summary_file).T.astype(int) for summary_file in summary_files), axis=1)