Commit 26aafc43 authored by Blaise Li's avatar Blaise Li
Browse files

Cleanup and backup code, factorizing function.

parent d192a6e2
......@@ -27,7 +27,7 @@ import pandas as pd
import matplotlib.pyplot as plt
from libhts import median_ratio_to_pseudo_ref_size_factors
from libworkflows import get_chrom_sizes
from libworkflows import get_chrom_sizes, cleanup_and_backup
from libworkflows import last_lines, ensure_relative, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded, read_feature_counts, sum_feature_counts
from smincludes import rules as irules
......@@ -650,6 +650,7 @@ bam2bigwig.sh: bedGraphToBigWig failed
onsuccess:
print("iCLIP data pre-processing finished.")
cleanup_and_backup(config)
onerror:
print("iCLIP data pre-processing failed.")
......
......@@ -47,7 +47,7 @@ import seaborn as sns
import husl
from libhts import do_deseq2, median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA
from libworkflows import ensure_relative
from libworkflows import ensure_relative, cleanup_and_backup
from libworkflows import get_chrom_sizes, column_converter
from libworkflows import strip_split, last_lines, save_plot, test_na_file
from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context
......@@ -77,6 +77,7 @@ LFC_RANGE = {
"DNA_transposons_rmsk_families" : (-10, 10),
"RNA_transposons_rmsk_families" : (-10, 10),
"simple_repeats_rmsk_families" : (-10, 10),
"pseudogene" : (-10, 10),
"alltypes" : (-10, 10)}
DE_BIOTYPES = list(LFC_RANGE.keys())
# Cutoffs in log fold change
......@@ -1440,6 +1441,7 @@ rule plot_meta_profile_mean:
onsuccess:
print("PRO-seq analysis finished.")
cleanup_and_backup(config)
onerror:
print("PRO-seq analysis failed.")
......
......@@ -62,9 +62,10 @@ from rpy2.robjects import Formula, StrVector
from libhts import do_deseq2, status_setter, plot_lfc_distribution, plot_MA
from libhts import median_ratio_to_pseudo_ref_size_factors, size_factor_correlations
from libhts import plot_paired_scatters, plot_norm_correlations, plot_counts_distribution
from libworkflows import ensure_relative
from libworkflows import ensure_relative, cleanup_and_backup
from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter
from libworkflows import strip_split, save_plot, test_na_file, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded
from libworkflows import read_htseq_counts, sum_htseq_counts
from libworkflows import read_intersect_counts, sum_intersect_counts
from libworkflows import read_feature_counts, sum_feature_counts
......@@ -86,6 +87,7 @@ LFC_RANGE = {
"DNA_transposons_rmsk_families" : (-10, 10),
"RNA_transposons_rmsk_families" : (-10, 10),
"simple_repeats_rmsk_families" : (-10, 10),
"pseudogene" : (-10, 10),
"alltypes" : (-10, 10)}
# Cutoffs in log fold change
LFC_CUTOFFS = [0.5, 1, 2]
......@@ -550,31 +552,6 @@ rule htseq_count_reads:
"file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/htseq_count_reads"
# NOTE: It is not clear that this really does what I think it does:
# https://www.biostars.org/p/161534/
# https://www.biostars.org/p/170406/
def feature_orientation2stranded(wildcards):
orientation = wildcards.orientation
if orientation == "fwd":
if LIB_TYPE[-2:] == "SF":
return 1
elif LIB_TYPE[-2:] == "SR":
return 2
else:
raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
elif orientation == "rev":
if LIB_TYPE[-2:] == "SF":
return 2
elif LIB_TYPE[-2:] == "SR":
return 1
else:
raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
elif orientation == "all":
return 0
else:
exit("Orientation is to be among \"fwd\", \"rev\" and \"all\".")
rule feature_count_reads:
input:
sorted_bam = rules.sam2indexedbam.output.sorted_bam,
......@@ -583,7 +560,7 @@ rule feature_count_reads:
counts = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts.txt"),
counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", f"{{lib}}_{{rep}}_on_{genome}", "{biotype}_{orientation}_counts_gene_names.txt"),
params:
stranded = feature_orientation2stranded,
stranded = feature_orientation2stranded(LIB_TYPE),
annot = biotype2annot,
# pickled dictionary that associates gene ids to gene names
converter = genome_dict["converter"]
......@@ -949,7 +926,7 @@ rule make_bigwig:
params:
genome_binned = genome_binned,
# orient_filter = genomecov_filter,
#threads: 12 # to limit memory usage, actually
threads: 12 # to limit memory usage, actually
log:
log = OPJ(log_dir, "make_bigwig", "{lib}_{rep}_{orientation}.log"),
err = OPJ(log_dir, "make_bigwig", "{lib}_{rep}_{orientation}.err"),
......@@ -1372,6 +1349,7 @@ rule make_MA_plot:
onsuccess:
print("RNA-seq analysis finished.")
cleanup_and_backup(config)
onerror:
print("RNA-seq analysis failed.")
from .libworkflows import SHELL_FUNCTIONS, column_converter, ensure_relative, file_len, filter_combinator, get_chrom_sizes, last_lines, make_id_list_getter, read_int_from_file, read_feature_counts, read_htseq_counts, read_intersect_counts, save_plot, strip_split, sum_feature_counts, sum_htseq_counts, sum_intersect_counts, test_na_file, warn_context
from .libworkflows import (
SHELL_FUNCTIONS, cleanup_and_backup, column_converter, ensure_relative,
file_len, feature_orientation2stranded, filter_combinator, get_chrom_sizes,
last_lines, make_id_list_getter, read_int_from_file, read_feature_counts,
read_htseq_counts, read_intersect_counts, save_plot, strip_split,
sum_feature_counts, sum_htseq_counts, sum_intersect_counts, test_na_file,
warn_context)
import os
import sys
from glob import glob
from shutil import rmtree
import warnings
from contextlib import contextmanager
from subprocess import Popen, PIPE
......@@ -92,6 +93,24 @@ def ensure_relative(path, basedir):
else:
return path
def cleanup_and_backup(config):
"""Performs cleanup and backup according to the information present in the
*config* dictionary."""
print("removing metadata")
# https://stackoverflow.com/a/45614282/1878788
rmtree(".snakemake/metadata")
if config.get("backup", {}):
user = config["backup"]["user"]
if user == "USER":
user = os.environ[user]
host = config["backup"]["host"]
# If no dest_dir, we assume the directory structure
# is the same on the destination host
dest_dir = config["backup"].get("dest_dir", os.getcwd())
print(f"backuping results to {user}@{host}:{dest_dir}")
shell(f"rsync -vaP {output_dir} {user}@{host}:{dest_dir}")
def read_int_from_file(filename):
"""Just reads a single integer from a file."""
with open(filename, "r") as f:
......@@ -122,6 +141,33 @@ def get_chrom_sizes(filename):
genome_size_file, "html5lib").sequencesizes.findAll("chromosome")])
# NOTE: It is not clear that this really does what I think it does:
# https://www.biostars.org/p/161534/
# https://www.biostars.org/p/170406/
def feature_orientation2stranded(LIB_TYPE):
def feature_stranded(wildcards):
orientation = wildcards.orientation
if orientation == "fwd":
if LIB_TYPE[-2:] == "SF":
return 1
elif LIB_TYPE[-2:] == "SR":
return 2
else:
raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
elif orientation == "rev":
if LIB_TYPE[-2:] == "SF":
return 2
elif LIB_TYPE[-2:] == "SR":
return 1
else:
raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
elif orientation == "all":
return 0
else:
exit("Orientation is to be among \"fwd\", \"rev\" and \"all\".")
return feature_stranded
def sum_htseq_counts(counts_filename):
with open(counts_filename) as counts_file:
return sum((int(fields[1]) for fields in map(
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment