# libworkflows.py — shared helper functions for snakemake-based workflows.
import os
import sys
from glob import glob
from shutil import rmtree
import warnings
from contextlib import contextmanager
from subprocess import Popen, PIPE
import pandas as pd
import matplotlib.pyplot as plt
from re import compile
# To parse genome size information
from bs4 import BeautifulSoup
# Shorthand for building paths.
OPJ = os.path.join


def formatwarning(message, category, filename, lineno, line=None):
    """Format a warning as "<file>:<line>: <Category>: <message>"."""
    return f"{filename}:{lineno}: {category.__name__}: {message}\n"


# Make the whole warnings machinery use this compact format.
warnings.formatwarning = formatwarning
# To be loaded in snakemake as follows:
# from libworkflows import SHELL_FUNCTIONS
# shell.prefix(SHELL_FUNCTIONS)
# Shell helper functions prepended to every snakemake shell command.
# NOTE(review): braces are doubled ("{{", "}}") — presumably this string is
# passed through str.format-style expansion (snakemake shell commands), which
# halves them back to literal shell braces; confirm against the consuming
# snakefiles.
SHELL_FUNCTIONS = """
# http://linuxcommand.org/wss0150.php
PROGNAME=$(basename $0)
function error_exit
{{
# ----------------------------------------------------------------
# Function for exit due to fatal program error
# Accepts 1 argument:
# string containing descriptive error message
# ----------------------------------------------------------------
echo "${{PROGNAME}}: ${{1:-"Unknown Error"}}" 1>&2
exit 1
}}
count_fastq_reads()
{{
# $1: file in which to write the number of fastq records
wc -l | {{ read nblines; echo ${{nblines}} / 4 | bc > ${{1}}; }}
}}
sort_by_seq()
{{
fastq-sort -s || error_exit "fastq-sort failed"
}}
dedup()
{{
sort_by_seq | remove-duplicates-from-sorted-fastq || error_exit "remove_duplicates_from_sorted_fastq failed"
}}
trim_random_nt()
{{
# $1: nb of bases to trim at 5' end
# $2: nb of bases to trim at 3' end
cutadapt -u ${{1}} -u -${{2}} - 2> /dev/null || error_exit "trim_random_nt failed"
}}
compute_size_distribution()
{{
# $1: file in which to write the size distribution
awk 'NR%4==2 {{histo[length($0)]++}} END {{for (l in histo) print l"\\t"histo[l]}}' > ${{1}}
}}
samstats2mapped()
{{
# $1: file containing the output of samtools stats
# $2: file in which to put the number of mappers
grep "^SN" ${{1}} | cut -f 2- | grep "^reads mapped:" | cut -f 2 > ${{2}}
}}
"""
# Free-function aliases for str methods (usable as map() callables).
strip = str.strip
split = str.split


def strip_split(text):
    """Strip leading/trailing whitespace from *text*, then split it on tabs."""
    return text.strip().split("\t")
def ensure_relative(path, basedir):
    """Returns the relative path to *path* from *basedir*.
    That way, a snakefile can be included using its absolute path."""
    # Relative paths pass through untouched.
    if not os.path.isabs(path):
        return path
    return os.path.relpath(path, basedir)
def cleanup_and_backup(config):
    """Performs cleanup and backup according to the information present in the
    *config* dictionary.

    Removes snakemake's metadata directory, then, if a "backup" section is
    present in *config* (keys "user", "host", optional "dest_dir"), rsyncs
    the results to the configured destination.
    """
    print("removing metadata")
    # https://stackoverflow.com/a/45614282/1878788
    try:
        rmtree(".snakemake/metadata")
    except FileNotFoundError:
        # Nothing to clean up if the metadata directory is already gone
        # (e.g. after a previous cleanup or a dry run).
        pass
    if config.get("backup", {}):
        user = config["backup"]["user"]
        # The literal value "USER" means: take the user name from the
        # environment variable of that name.
        if user == "USER":
            user = os.environ[user]
        host = config["backup"]["host"]
        # If no dest_dir, we assume the directory structure
        # is the same on the destination host
        dest_dir = config["backup"].get("dest_dir", os.getcwd())
        print(f"backuping results to {user}@{host}:{dest_dir}")
        # NOTE(review): *shell* and *output_dir* are not defined in this
        # module; they are presumably injected into the namespace by
        # snakemake at run time — confirm before calling this elsewhere.
        shell(f"rsync -vaP {output_dir} {user}@{host}:{dest_dir}")
def read_int_from_file(filename):
    """Just reads a single integer from a file."""
    with open(filename, "r") as int_file:
        first_line = int_file.readline()
    return int(first_line.strip())
@contextmanager
def warn_context(warn_filename):
    """Yields a function that writes warnings in file *warn_filename*
    (and also to stderr).

    The previous *warnings.showwarning* hook is restored on exit, so the
    redirection does not leak outside the context.
    """
    previous_showwarning = warnings.showwarning
    with open(warn_filename, "w") as warn_file:
        def warn_to_file(message, category, filename, lineno, file=None, line=None):
            formatted = warnings.formatwarning(message, category, filename, lineno)
            # echo to stderr
            print(formatted, file=sys.stderr)
            warn_file.write(formatted)
        warnings.showwarning = warn_to_file
        try:
            yield warnings.warn
        finally:
            # Fix: the original left *warn_to_file* installed after the file
            # was closed, so any later warning raised ValueError on a closed
            # file. Restore the hook that was active on entry.
            warnings.showwarning = previous_showwarning
def get_chrom_sizes(filename):
    """Parses an illumina iGenome GenomeSize.xml file.
    Returns a dictionary with chromosome names as keys
    and chromosome lengths as values."""
    with open(filename, "r") as genome_size_file:
        soup = BeautifulSoup(genome_size_file, "html5lib")
        # One <chromosome> tag per contig under <sequencesizes>.
        return {
            chrom["contigname"]: int(chrom["totalbases"])
            for chrom in soup.sequencesizes.findAll("chromosome")}
# NOTE: It is not clear that this really does what I think it does:
# https://www.biostars.org/p/161534/
# https://www.biostars.org/p/170406/
def feature_orientation2stranded(LIB_TYPE):
    """Return a function mapping a *wildcards* object (whose *orientation*
    attribute is "fwd", "rev" or "all") to the strandedness code (0, 1 or 2)
    used by strand-aware read-counting software.

    Only library types ending in "SF" (forward-stranded) or "SR"
    (reverse-stranded) support the "fwd" and "rev" orientations; other
    types raise ValueError for those orientations.
    """
    def feature_stranded(wildcards):
        orientation = wildcards.orientation
        if orientation == "all":
            # Unstranded counting.
            return 0
        if orientation not in {"fwd", "rev"}:
            # Fix: was exit(...) (SystemExit) — raise ValueError like the
            # other invalid-input branches so callers can handle it.
            raise ValueError(
                "Orientation is to be among \"fwd\", \"rev\" and \"all\".")
        strandedness = LIB_TYPE[-2:]
        if strandedness not in {"SF", "SR"}:
            raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
        if orientation == "fwd":
            return 1 if strandedness == "SF" else 2
        # orientation == "rev": the codes are swapped.
        return 2 if strandedness == "SF" else 1
    return feature_stranded
def sum_htseq_counts(counts_filename):
    """Sum the per-gene counts in an htseq-count output file, ignoring the
    special "__*" summary lines (__no_feature, __ambiguous, ...)."""
    total = 0
    with open(counts_filename) as counts_file:
        for line in counts_file:
            fields = line.strip().split("\t")
            if not fields[0].startswith("__"):
                total += int(fields[1])
    return total
def read_htseq_counts(counts_filename):
    """Load an htseq-count output file as a DataFrame indexed by gene ID,
    dropping htseq's special "__*" summary rows when present."""
    special_rows = [
        "__no_feature",
        "__ambiguous",
        "__too_low_aQual",
        "__not_aligned",
        "__alignment_not_unique",
    ]
    counts = pd.read_table(counts_filename, header=None, index_col=0)
    # errors="ignore": a file may lack some (or all) of the summary rows.
    return counts.drop(special_rows, errors="ignore")
def sum_feature_counts(counts_filename, nb_bams=1):
    """Sums all counts in a featureCounts generated *counts_filename*.

    *nb_bams* indicates the number of bam files that were given to
    featureCounts; this determines which columns hold counts.
    """
    # Counts start at the 7-th column (0-based index 6), third row onward
    # (the first two rows are a comment line and the header).
    count_columns = range(6, 6 + nb_bams)
    counts = pd.read_table(
        counts_filename, skiprows=2, usecols=count_columns, header=None)
    # First sum collapses the rows, second collapses the per-bam columns.
    return counts.sum().sum()
def read_feature_counts(counts_filename, nb_bams=1):
    """Load a featureCounts output file as a DataFrame indexed by feature ID,
    keeping only the count columns (one per bam file, starting at column 6)."""
    wanted_columns = [0, *range(6, 6 + nb_bams)]
    # skiprows=1 skips the leading comment line; the next row is the header.
    return pd.read_table(
        counts_filename, skiprows=1, usecols=wanted_columns, index_col=0)
# Example of the expected bedtools-intersect input (7-th column = counts):
# I 3746 3909 "WBGene00023193" - . 17996
# I 4118 10230 "WBGene00022277" - . 1848
# I 10412 16842 "WBGene00022276" + . 4814
# I 17482 26781 "WBGene00022278" - . 4282
# I 22881 23600 "WBGene00235381" - . 421
# I 27594 32482 "WBGene00022279" - . 2237
# I 31522 31543 "WBGene00170953" + . 110
# I 32414 32435 "WBGene00173569" + . 87
# I 43732 44677 "WBGene00022275" + . 24
# I 47471 49819 "WBGene00044345" + . 846
def sum_intersect_counts(counts_filename):
    """Sums all counts in a bedtools intersect generated *counts_filename*, where the annotation was in bed format."""
    try:
        # Counts are in the 7-th column (0-based index 6).
        count_column = pd.read_table(counts_filename, usecols=[6], header=None)
    except pd.errors.EmptyDataError:
        # An empty file yields the string "NA" (see test_na_file).
        return "NA"
    return count_column.sum().iloc[0]
def read_intersect_counts(counts_filename):
    """Load a bedtools intersect output file as a DataFrame indexed by the
    gene name (bed column 3), with the counts taken from column 6."""
    # index_col takes effect after column selection with usecols, hence
    # index_col=0 (the former column 3):
    # https://stackoverflow.com/a/45943627/1878788
    try:
        return pd.read_table(
            counts_filename, usecols=[3, 6], header=None, index_col=0)
    except pd.errors.EmptyDataError:
        # Empty input: return an empty frame with the expected index name.
        empty = pd.DataFrame(index=[], columns=["gene", "counts"])
        return empty.set_index("gene")
# http://stackoverflow.com/a/845069/1878788
def file_len(fname):
    """Count the lines in file *fname* using the external *wc* command."""
    wc_process = Popen(["wc", "-l", fname], stdout=PIPE, stderr=PIPE)
    out, err = wc_process.communicate()
    if wc_process.returncode:
        raise IOError(err)
    # wc prints "<count> <filename>"; keep the count.
    return int(out.split()[0])
def last_lines(fname, nb_lines=1):
    """Return the last *nb_lines* lines of file *fname* as one string,
    using the external *tail* command."""
    tail_process = Popen(
        ["tail", f"-{nb_lines}", fname], stdout=PIPE, stderr=PIPE)
    out, err = tail_process.communicate()
    if tail_process.returncode:
        raise IOError(err)
    return out.decode("utf-8")
def test_na_file(fname):
with open(fname, "r") as f:
firstline = f.readline().strip()
return firstline == "NA"
def filter_combinator(combinator, blacklist):
    """This function builds a wildcards combination generator
    based on the generator *combinator* and a set of combinations
    to exclude *blacklist*."""
    def filtered_combinator(*args, **kwargs):
        """This function generates wildcards combinations.
        It is to be used as second argument of *expand*."""
        # Frozensets (rather than tuples) accommodate unpredictable
        # wildcard order when matching against the blacklist.
        yield from (
            wc_comb
            for wc_comb in combinator(*args, **kwargs)
            if frozenset(wc_comb) not in blacklist)
    return filtered_combinator
def column_converter(convert_dict, column_name=None):
    """This generates a function that can be used in *pandas.DataFrame.apply*.
    *convert_dict* will be used to generate new names from those found in the index,
    or in the column given by *column_name*.
    Names absent from *convert_dict* are kept unchanged."""
    rename = convert_dict.get
    if column_name is None:
        # Rename based on the row's index label.
        def convert_name(row):
            current = row.name
            return rename(current, current)
    else:
        # Rename based on the value found in column *column_name*.
        def convert_name(row):
            current = row[column_name]
            return rename(current, current)
    return convert_name
def save_plot(outfile,
              plot_func,
              *args,
              title=None, format=None,
              tight=True, equal_axes=False,
              **kwargs):
    """Draw a figure with *plot_func(*args, **kwargs)* and save it to
    *outfile*. *format* is needed when using multiple pages output."""
    # https://stackoverflow.com/a/10154763/1878788
    # *plot_func* may return extra artists (e.g. a legend placed outside
    # the axes) that savefig must take into account.
    extra_artists = plot_func(*args, **kwargs)
    save_kwds = {} if extra_artists is None else {
        "bbox_extra_artists": extra_artists}
    if title is not None:
        plt.gcf().suptitle(title)
    if equal_axes:
        plt.axis("equal")
    if tight:
        save_kwds["bbox_inches"] = "tight"
    # savefig's *format* defaults to None (inferred from *outfile*),
    # so it can be forwarded unconditionally.
    plt.savefig(outfile, format=format, **save_kwds)
def make_id_list_getter(gene_lists_dir, avail_id_lists=None):
    """Build a function resolving a gene-list name (or a *wildcards* object
    bearing an *id_list* attribute) to the gene identifiers read from the
    corresponding "*_ids.txt" file in *gene_lists_dir*."""
    if avail_id_lists is None:
        avail_id_lists = set(glob(os.path.join(gene_lists_dir, "*_ids.txt")))
    # Recognizes the AttributeError raised when a plain string is passed
    # instead of a wildcards object.
    str_attr_err = compile("'str' object has no attribute '.*'")

    def parse_id_file(list_filename):
        # One identifier per line (first tab-separated field);
        # lines starting with "#" are comments.
        with open(list_filename, "r") as infile:
            return [line.strip().split("\t")[0]
                    for line in infile
                    if line[0] != "#"]

    def get_id_list(wildcards):
        """Instead of a "wildcards" object, a string can be used directly."""
        try:
            id_list = wildcards.id_list
        except AttributeError as err:
            # This may not be a "wildcards" object;
            # try to use it as a string.
            if str_attr_err.match(str(err)) is None:
                raise
            id_list = wildcards
        # This may be directly a path.
        if id_list in avail_id_lists:
            return parse_id_file(id_list)
        if id_list == "lfc_statuses":
            return None
        list_filename = os.path.join(gene_lists_dir, f"{id_list}_ids.txt")
        if list_filename in avail_id_lists:
            return parse_id_file(list_filename)
        raise NotImplementedError(f"{id_list} unknown.\n")
    return get_id_list