diff --git a/CLIP/iCLIP.snakefile b/CLIP/iCLIP.snakefile index 385431bc3c9d5488608dd16a95d671dd72cfc0e6..b1f47770d7aba3d7f0693feed82952ba20cf4609 100644 --- a/CLIP/iCLIP.snakefile +++ b/CLIP/iCLIP.snakefile @@ -27,7 +27,7 @@ import pandas as pd import matplotlib.pyplot as plt from libhts import median_ratio_to_pseudo_ref_size_factors -from libworkflows import get_chrom_sizes +from libworkflows import get_chrom_sizes, cleanup_and_backup from libworkflows import last_lines, ensure_relative, SHELL_FUNCTIONS, warn_context from libworkflows import feature_orientation2stranded, read_feature_counts, sum_feature_counts from smincludes import rules as irules @@ -650,6 +650,7 @@ bam2bigwig.sh: bedGraphToBigWig failed onsuccess: print("iCLIP data pre-processing finished.") + cleanup_and_backup(config) onerror: print("iCLIP data pre-processing failed.") diff --git a/PRO-seq/PRO-seq.snakefile b/PRO-seq/PRO-seq.snakefile index 9a18c5aacad928724d4c991e9c273bcd46ace6a9..e2687e8a5e03e47b59718abfc2f9c88b0ed16a86 100644 --- a/PRO-seq/PRO-seq.snakefile +++ b/PRO-seq/PRO-seq.snakefile @@ -47,7 +47,7 @@ import seaborn as sns import husl from libhts import do_deseq2, median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA -from libworkflows import ensure_relative +from libworkflows import ensure_relative, cleanup_and_backup from libworkflows import get_chrom_sizes, column_converter from libworkflows import strip_split, last_lines, save_plot, test_na_file from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context @@ -77,6 +77,7 @@ LFC_RANGE = { "DNA_transposons_rmsk_families" : (-10, 10), "RNA_transposons_rmsk_families" : (-10, 10), "simple_repeats_rmsk_families" : (-10, 10), + "pseudogene" : (-10, 10), "alltypes" : (-10, 10)} DE_BIOTYPES = list(LFC_RANGE.keys()) # Cutoffs in log fold change @@ -1440,6 +1441,7 @@ rule plot_meta_profile_mean: onsuccess: print("PRO-seq analysis finished.") + cleanup_and_backup(config) onerror: print("PRO-seq analysis failed.") diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile index 8b33ab64e3f61894773f58d089ce0b817136ec21..18dfd1022a2fff087befbf22f966aaa58cfc440c 100644 --- a/RNA_Seq_Cecere/RNA-seq.snakefile +++ b/RNA_Seq_Cecere/RNA-seq.snakefile @@ -62,9 +62,10 @@ from rpy2.robjects import Formula, StrVector from libhts import do_deseq2, status_setter, plot_lfc_distribution, plot_MA from libhts import median_ratio_to_pseudo_ref_size_factors, size_factor_correlations from libhts import plot_paired_scatters, plot_norm_correlations, plot_counts_distribution -from libworkflows import ensure_relative +from libworkflows import ensure_relative, cleanup_and_backup from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter from libworkflows import strip_split, save_plot, test_na_file, SHELL_FUNCTIONS, warn_context +from libworkflows import feature_orientation2stranded from libworkflows import read_htseq_counts, sum_htseq_counts from libworkflows import read_intersect_counts, sum_intersect_counts from libworkflows import read_feature_counts, sum_feature_counts @@ -86,6 +87,7 @@ LFC_RANGE = { "DNA_transposons_rmsk_families" : (-10, 10), "RNA_transposons_rmsk_families" : (-10, 10), "simple_repeats_rmsk_families" : (-10, 10), + "pseudogene" : (-10, 10), "alltypes" : (-10, 10)} # Cutoffs in log fold change LFC_CUTOFFS = [0.5, 1, 2] @@ -550,31 +552,6 @@ rule htseq_count_reads: "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/htseq_count_reads" -# NOTE: It is not clear that this really does what I think it does: -# https://www.biostars.org/p/161534/ -# https://www.biostars.org/p/170406/ -def feature_orientation2stranded(wildcards): - orientation = wildcards.orientation - if orientation == "fwd": - if LIB_TYPE[-2:] == "SF": - return 1 - elif LIB_TYPE[-2:] == "SR": - return 2 - else: - raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.") - elif orientation == "rev": - if LIB_TYPE[-2:] == "SF": - return 2 - elif LIB_TYPE[-2:] == "SR": - return 1 - else: - raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.") - elif orientation == "all": - return 0 - else: - exit("Orientation is to be among \"fwd\", \"rev\" and \"all\".") - - rule feature_count_reads: input: sorted_bam = rules.sam2indexedbam.output.sorted_bam, @@ -583,7 +560,7 @@ rule feature_count_reads: counts = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts.txt"), counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts_gene_names.txt"), params: - stranded = feature_orientation2stranded, + stranded = feature_orientation2stranded(LIB_TYPE), annot = biotype2annot, # pickled dictionary that associates gene ids to gene names converter = genome_dict["converter"] @@ -949,7 +926,7 @@ rule make_bigwig: params: genome_binned = genome_binned, # orient_filter = genomecov_filter, - #threads: 12 # to limit memory usage, actually + threads: 12 # to limit memory usage, actually log: log = OPJ(log_dir, "make_bigwig", "{lib}_{rep}_{orientation}.log"), err = OPJ(log_dir, "make_bigwig", "{lib}_{rep}_{orientation}.err"), @@ -1372,6 +1349,7 @@ rule make_MA_plot: onsuccess: print("RNA-seq analysis finished.") + cleanup_and_backup(config) onerror: print("RNA-seq analysis failed.") diff --git a/libworkflows/libworkflows/__init__.py b/libworkflows/libworkflows/__init__.py index 2c08aaee7e394b2155920e08f5e7a76d7df0cbe3..40e1360bcfdac7164a60133bc3677b1afa0260cc 100644 --- a/libworkflows/libworkflows/__init__.py +++ b/libworkflows/libworkflows/__init__.py @@ -1 +1,7 @@ -from .libworkflows import SHELL_FUNCTIONS, column_converter, ensure_relative, file_len, filter_combinator, get_chrom_sizes, last_lines, make_id_list_getter, read_int_from_file, read_feature_counts, read_htseq_counts, read_intersect_counts, save_plot, strip_split, sum_feature_counts, sum_htseq_counts, sum_intersect_counts, test_na_file, warn_context +from .libworkflows import ( + SHELL_FUNCTIONS, cleanup_and_backup, column_converter, ensure_relative, + file_len, feature_orientation2stranded, filter_combinator, get_chrom_sizes, + last_lines, make_id_list_getter, read_int_from_file, read_feature_counts, + read_htseq_counts, read_intersect_counts, save_plot, strip_split, + sum_feature_counts, sum_htseq_counts, sum_intersect_counts, test_na_file, + warn_context) diff --git a/libworkflows/libworkflows/libworkflows.py b/libworkflows/libworkflows/libworkflows.py index 1b1328e04341577ab9d3b08240bfdc9b771f5d57..49590cfcbd47e21cec9c8537006f846cb397e04a 100644 --- a/libworkflows/libworkflows/libworkflows.py +++ b/libworkflows/libworkflows/libworkflows.py @@ -1,6 +1,7 @@ import os import sys from glob import glob +from shutil import rmtree import warnings from contextlib import contextmanager from subprocess import Popen, PIPE @@ -92,6 +93,24 @@ def ensure_relative(path, basedir): else: return path +def cleanup_and_backup(config): + """Performs cleanup and backup according to the information present in the + *config* dictionary.""" + print("removing metadata") + # https://stackoverflow.com/a/45614282/1878788 + rmtree(".snakemake/metadata") + if config.get("backup", {}): + user = config["backup"]["user"] + if user == "USER": + user = os.environ[user] + host = config["backup"]["host"] + # If no dest_dir, we assume the directory structure + # is the same on the destination host + dest_dir = config["backup"].get("dest_dir", os.getcwd()) + print(f"backuping results to {user}@{host}:{dest_dir}") + shell(f"rsync -vaP {output_dir} {user}@{host}:{dest_dir}") + + def read_int_from_file(filename): """Just reads a single integer from a file.""" with open(filename, "r") as f: @@ -122,6 +141,33 @@ def get_chrom_sizes(filename): genome_size_file, "html5lib").sequencesizes.findAll("chromosome")]) +# NOTE: It is not clear that this really does what I think it does: +# https://www.biostars.org/p/161534/ +# https://www.biostars.org/p/170406/ +def feature_orientation2stranded(LIB_TYPE): + def feature_stranded(wildcards): + orientation = wildcards.orientation + if orientation == "fwd": + if LIB_TYPE[-2:] == "SF": + return 1 + elif LIB_TYPE[-2:] == "SR": + return 2 + else: + raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.") + elif orientation == "rev": + if LIB_TYPE[-2:] == "SF": + return 2 + elif LIB_TYPE[-2:] == "SR": + return 1 + else: + raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.") + elif orientation == "all": + return 0 + else: + exit("Orientation is to be among \"fwd\", \"rev\" and \"all\".") + return feature_stranded + + def sum_htseq_counts(counts_filename): with open(counts_filename) as counts_file: return sum((int(fields[1]) for fields in map(