Skip to content
Snippets Groups Projects
Commit 26aafc43 authored by Blaise Li's avatar Blaise Li
Browse files

Cleanup and backup code, factorizing function.

parent d192a6e2
No related branches found
No related tags found
No related merge requests found
......@@ -27,7 +27,7 @@ import pandas as pd
import matplotlib.pyplot as plt
from libhts import median_ratio_to_pseudo_ref_size_factors
from libworkflows import get_chrom_sizes
from libworkflows import get_chrom_sizes, cleanup_and_backup
from libworkflows import last_lines, ensure_relative, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded, read_feature_counts, sum_feature_counts
from smincludes import rules as irules
......@@ -650,6 +650,7 @@ bam2bigwig.sh: bedGraphToBigWig failed
onsuccess:
print("iCLIP data pre-processing finished.")
cleanup_and_backup(config)
onerror:
print("iCLIP data pre-processing failed.")
......
......@@ -47,7 +47,7 @@ import seaborn as sns
import husl
from libhts import do_deseq2, median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA
from libworkflows import ensure_relative
from libworkflows import ensure_relative, cleanup_and_backup
from libworkflows import get_chrom_sizes, column_converter
from libworkflows import strip_split, last_lines, save_plot, test_na_file
from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context
......@@ -77,6 +77,7 @@ LFC_RANGE = {
"DNA_transposons_rmsk_families" : (-10, 10),
"RNA_transposons_rmsk_families" : (-10, 10),
"simple_repeats_rmsk_families" : (-10, 10),
"pseudogene" : (-10, 10),
"alltypes" : (-10, 10)}
DE_BIOTYPES = list(LFC_RANGE.keys())
# Cutoffs in log fold change
......@@ -1440,6 +1441,7 @@ rule plot_meta_profile_mean:
onsuccess:
print("PRO-seq analysis finished.")
cleanup_and_backup(config)
onerror:
print("PRO-seq analysis failed.")
......
......@@ -62,9 +62,10 @@ from rpy2.robjects import Formula, StrVector
from libhts import do_deseq2, status_setter, plot_lfc_distribution, plot_MA
from libhts import median_ratio_to_pseudo_ref_size_factors, size_factor_correlations
from libhts import plot_paired_scatters, plot_norm_correlations, plot_counts_distribution
from libworkflows import ensure_relative
from libworkflows import ensure_relative, cleanup_and_backup
from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter
from libworkflows import strip_split, save_plot, test_na_file, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded
from libworkflows import read_htseq_counts, sum_htseq_counts
from libworkflows import read_intersect_counts, sum_intersect_counts
from libworkflows import read_feature_counts, sum_feature_counts
......@@ -86,6 +87,7 @@ LFC_RANGE = {
"DNA_transposons_rmsk_families" : (-10, 10),
"RNA_transposons_rmsk_families" : (-10, 10),
"simple_repeats_rmsk_families" : (-10, 10),
"pseudogene" : (-10, 10),
"alltypes" : (-10, 10)}
# Cutoffs in log fold change
LFC_CUTOFFS = [0.5, 1, 2]
......@@ -550,31 +552,6 @@ rule htseq_count_reads:
"file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/htseq_count_reads"
# NOTE: It is not clear that this really does what I think it does:
# https://www.biostars.org/p/161534/
# https://www.biostars.org/p/170406/
def feature_orientation2stranded(wildcards):
orientation = wildcards.orientation
if orientation == "fwd":
if LIB_TYPE[-2:] == "SF":
return 1
elif LIB_TYPE[-2:] == "SR":
return 2
else:
raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
elif orientation == "rev":
if LIB_TYPE[-2:] == "SF":
return 2
elif LIB_TYPE[-2:] == "SR":
return 1
else:
raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
elif orientation == "all":
return 0
else:
exit("Orientation is to be among \"fwd\", \"rev\" and \"all\".")
rule feature_count_reads:
input:
sorted_bam = rules.sam2indexedbam.output.sorted_bam,
......@@ -583,7 +560,7 @@ rule feature_count_reads:
counts = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts.txt"),
counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts_gene_names.txt"),
params:
stranded = feature_orientation2stranded,
stranded = feature_orientation2stranded(LIB_TYPE),
annot = biotype2annot,
# pickled dictionary that associates gene ids to gene names
converter = genome_dict["converter"]
......@@ -949,7 +926,7 @@ rule make_bigwig:
params:
genome_binned = genome_binned,
# orient_filter = genomecov_filter,
#threads: 12 # to limit memory usage, actually
threads: 12 # to limit memory usage, actually
log:
log = OPJ(log_dir, "make_bigwig", "{lib}_{rep}_{orientation}.log"),
err = OPJ(log_dir, "make_bigwig", "{lib}_{rep}_{orientation}.err"),
......@@ -1372,6 +1349,7 @@ rule make_MA_plot:
onsuccess:
print("RNA-seq analysis finished.")
cleanup_and_backup(config)
onerror:
print("RNA-seq analysis failed.")
from .libworkflows import SHELL_FUNCTIONS, column_converter, ensure_relative, file_len, filter_combinator, get_chrom_sizes, last_lines, make_id_list_getter, read_int_from_file, read_feature_counts, read_htseq_counts, read_intersect_counts, save_plot, strip_split, sum_feature_counts, sum_htseq_counts, sum_intersect_counts, test_na_file, warn_context
from .libworkflows import (
SHELL_FUNCTIONS, cleanup_and_backup, column_converter, ensure_relative,
file_len, feature_orientation2stranded, filter_combinator, get_chrom_sizes,
last_lines, make_id_list_getter, read_int_from_file, read_feature_counts,
read_htseq_counts, read_intersect_counts, save_plot, strip_split,
sum_feature_counts, sum_htseq_counts, sum_intersect_counts, test_na_file,
warn_context)
import os
import sys
from glob import glob
from shutil import rmtree
import warnings
from contextlib import contextmanager
from subprocess import Popen, PIPE
......@@ -92,6 +93,24 @@ def ensure_relative(path, basedir):
else:
return path
def cleanup_and_backup(config):
"""Performs cleanup and backup according to the information present in the
*config* dictionary."""
print("removing metadata")
# https://stackoverflow.com/a/45614282/1878788
rmtree(".snakemake/metadata")
if config.get("backup", {}):
user = config["backup"]["user"]
if user == "USER":
user = os.environ[user]
host = config["backup"]["host"]
# If no dest_dir, we assume the directory structure
# is the same on the destination host
dest_dir = config["backup"].get("dest_dir", os.getcwd())
print(f"backuping results to {user}@{host}:{dest_dir}")
shell(f"rsync -vaP {output_dir} {user}@{host}:{dest_dir}")
def read_int_from_file(filename):
"""Just reads a single integer from a file."""
with open(filename, "r") as f:
......@@ -122,6 +141,33 @@ def get_chrom_sizes(filename):
genome_size_file, "html5lib").sequencesizes.findAll("chromosome")])
# NOTE: It is not clear that this really does what I think it does:
# https://www.biostars.org/p/161534/
# https://www.biostars.org/p/170406/
def feature_orientation2stranded(LIB_TYPE):
def feature_stranded(wildcards):
orientation = wildcards.orientation
if orientation == "fwd":
if LIB_TYPE[-2:] == "SF":
return 1
elif LIB_TYPE[-2:] == "SR":
return 2
else:
raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
elif orientation == "rev":
if LIB_TYPE[-2:] == "SF":
return 2
elif LIB_TYPE[-2:] == "SR":
return 1
else:
raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
elif orientation == "all":
return 0
else:
exit("Orientation is to be among \"fwd\", \"rev\" and \"all\".")
return feature_stranded
def sum_htseq_counts(counts_filename):
with open(counts_filename) as counts_file:
return sum((int(fields[1]) for fields in map(
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment