Commit 6e9f7831 authored by Blaise Li

Made libworkflows a submodule.

See https://stackoverflow.com/a/18129693/1878788, and the first user
comment (git rm -r, rm -r, git submodule add).
parent bfcb15fc
@@ -4,3 +4,6 @@
[submodule "libcelegans"]
path = libcelegans
url = git@gitlab.pasteur.fr:bli/libcelegans.git
[submodule "libworkflows"]
path = libworkflows
url = git@gitlab.pasteur.fr:bli/libworkflows.git
Subproject commit 7d2c57bc1e06be42e92801f60da8407082a21b97
# Compiled python modules.
*.pyc
# Setuptools distribution folder.
/dist/
# Python egg metadata, regenerated from source files by setuptools.
/*.egg-info
# Backups
*~
#!/bin/sh
python3.6 setup.py build_ext
# .egg-link does not work with PYTHONPATH ?
python3.6 -m pip install -e .
python3.6 -m pip install --no-deps --ignore-installed .
from .libworkflows import (
SHELL_FUNCTIONS, cleanup_and_backup, column_converter, ensure_relative,
file_len, feature_orientation2stranded, filter_combinator, get_chrom_sizes,
last_lines, make_id_list_getter,
partial_format,
read_float_from_file, read_int_from_file,
read_feature_counts, read_htseq_counts, read_intersect_counts,
save_plot, strip_split,
sum_by_family, sum_feature_counts, sum_htseq_counts, sum_intersect_counts,
test_na_file,
texscape,
warn_context, wc_applied)
import os
import sys
from glob import glob
from shutil import rmtree
import warnings
from contextlib import contextmanager
from subprocess import Popen, PIPE
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from re import compile, sub
from snakemake import shell
from snakemake.io import apply_wildcards
# To parse genome size information
from bs4 import BeautifulSoup
OPJ = os.path.join
def formatwarning(message, category, filename, lineno, line=None):
"""Used to format warning messages."""
return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)
warnings.formatwarning = formatwarning
# To be loaded in snakemake as follows:
# from libworkflows import SHELL_FUNCTIONS
# shell.prefix(SHELL_FUNCTIONS)
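# NB: braces below are doubled ({{ and }}) presumably because snakemake
# formats shell commands (prefix included) with str.format, so literal
# shell braces must be escaped.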
SHELL_FUNCTIONS = """
# http://linuxcommand.org/wss0150.php
PROGNAME=$(basename $0)
function error_exit
{{
# ----------------------------------------------------------------
# Function for exit due to fatal program error
# Accepts 1 argument:
# string containing descriptive error message
# ----------------------------------------------------------------
echo "${{PROGNAME}}: ${{1:-"Unknown Error"}}" 1>&2
exit 1
}}
count_fastq_reads()
{{
# $1: file in which to write the number of fastq records
wc -l | {{ read nblines; echo ${{nblines}} / 4 | bc > ${{1}}; }}
}}
sort_by_seq()
{{
niceload --mem 500M fastq-sort -s || error_exit "fastq-sort failed"
}}
dedup()
{{
sort_by_seq | niceload --mem 500M remove-duplicates-from-sorted-fastq || error_exit "remove-duplicates-from-sorted-fastq failed"
}}
trim_random_nt()
{{
# $1: nb of bases to trim at 5' end
# $2: nb of bases to trim at 3' end
niceload --noswap -q cutadapt -u ${{1}} -u -${{2}} - 2> /dev/null || error_exit "trim_random_nt failed"
}}
compute_size_distribution()
{{
# $1: file in which to write the size distribution
awk 'NR%4==2 {{histo[length($0)]++}} END {{for (l in histo) print l"\\t"histo[l]}}' > ${{1}}
}}
samstats2mapped()
{{
# $1: file containing the output of samtools stats
# $2: file in which to put the number of mappers
grep "^SN" ${{1}} | cut -f 2- | grep "^reads mapped:" | cut -f 2 > ${{2}}
}}
"""
strip = str.strip
split = str.split
def strip_split(text):
return split(strip(text), "\t")
# import inspect
def texscape(text):
"""Escapes underscores to make a latex-compatible text."""
# https://stackoverflow.com/a/2654130/1878788
# currframe = inspect.currentframe()
# callframe = inspect.getouterframes(currframe)
# print(f"Escaping {text}\n(Called by {callframe})")
# print(f"Escaping {text}")
# return sub("_", r"\_", text)
# To avoid double escape:
    # NB: a bare r"\_" in the replacement template is an invalid escape
    # from Python 3.7 onwards, hence the doubled backslash.
    return sub(r"([^\\])_", r"\1\\_", text)
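# Illustrative behaviour sketch (examples are assumptions, not from the
# original): texscape("wild_type") gives "wild\_type", and an already
# escaped "wild\_type" is left unchanged. Note that the pattern requires
# a preceding character, so a leading underscore is not escaped.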
def ensure_relative(path, basedir):
"""Returns the relative path to *path* from *basedir*.
That way, a snakefile can be included using its absolute path."""
if os.path.isabs(path):
return os.path.relpath(path, basedir)
else:
return path
# https://stackoverflow.com/a/15728287/1878788
import string
from _string import formatter_field_name_split
class PartialFormatter(string.Formatter):
def get_field(self, field_name, args, kwargs):
try:
val = super(PartialFormatter, self).get_field(field_name, args, kwargs)
except (IndexError, KeyError, AttributeError):
first, _ = formatter_field_name_split(field_name)
val = '{' + field_name + '}', first
return val
partial_format = PartialFormatter().format
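# Illustrative sketch (field names are made up): fields for which no value
# is provided are left as placeholders, so
# partial_format("{day}/{tissue}.txt", day="D2") == "D2/{tissue}.txt"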
def wc_applied(source_function):
"""
This can be used as a decorator for rule input sourcing functions.
    This ensures that results returned in the form of explicit output from a
    rule have the wildcard substitution applied.
See <https://bitbucket.org/snakemake/snakemake/issues/956/placeholders-not-properly-matched-with>.
"""
def wc_applied_source_func(wildcards):
filename_templates = source_function(wildcards)
if not isinstance(filename_templates, (str, bytes)):
return [apply_wildcards(filename_template, wildcards) for filename_template in filename_templates]
else:
return apply_wildcards(filename_templates, wildcards)
return wc_applied_source_func
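# Hypothetical usage sketch (the rule and output names are assumptions):
# @wc_applied
# def source_counts(wildcards):
#     return rules.feature_count.output.counts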
def cleanup_and_backup(output_dir, config, delete=False):
"""Performs cleanup and backup according to the information present in the
*config* dictionary."""
print("removing metadata")
# https://stackoverflow.com/a/45614282/1878788
rmtree(".snakemake/metadata")
if config.get("backup", {}):
user = config["backup"]["user"]
if user == "USER":
user = os.environ[user]
host = config["backup"]["host"]
# If no dest_dir, we assume the directory structure
# is the same on the destination host
dest_dir = config["backup"].get("dest_dir", os.getcwd())
# TODO: test this to use lftp to send results to caserta
#dest_dir = OPB(dest_dir)
#f"cd ..; lftp cecerelab:elegans2015@157.99.85.153 -e \"cd DataMHE/Analyses/{user}; mirror -RL {dest_dir}/; exit\""
if delete:
rsync_options = "-vaP --exclude=\"*.fastq.gz\" --delete"
else:
rsync_options = "-vaP --exclude=\"*.fastq.gz\""
# TODO: only upload logs when dest_dir[-4:] == "_err"
    try:
        print(f"backing up results to {user}@{host}:{dest_dir}")
        shell(f"rsync {rsync_options} {output_dir} {user}@{host}:{dest_dir}")
    # TODO: find the explicit and correct exception
    except Exception:
        print(f"backing up results to {user}@myriad:{dest_dir}")
        shell(f"rsync {rsync_options} {output_dir} {user}@myriad:{dest_dir}")
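# Illustrative *config* structure expected by cleanup_and_backup
# (host and directory are made-up values):
# config = {"backup": {
#     "user": "USER",  # the literal "USER" means: take the name from $USER
#     "host": "backup.example.org",
#     "dest_dir": "/path/to/backups"}}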
def read_int_from_file(filename):
"""Just reads a single integer from a file."""
with open(filename, "r") as f:
return int(f.readline().strip())
def read_float_from_file(filename):
"""Just reads a single float from a file."""
with open(filename, "r") as f:
return float(f.readline().strip())
@contextmanager
def warn_context(warn_filename):
    """Yields a function that writes warnings in file *warn_filename*
    (and also to stderr)."""
    with open(warn_filename, "w") as warn_file:
        def warn_to_file(message, category, filename, lineno, file=None, line=None):
            formatted = warnings.formatwarning(message, category, filename, lineno, line)
            # echo to stderr
            print(formatted, file=sys.stderr)
            warn_file.write(formatted)
        old_showwarning = warnings.showwarning
        warnings.showwarning = warn_to_file
        try:
            yield warnings.warn
        finally:
            # Restore the previous handler so that warnings do not go
            # to a closed file once the context has exited.
            warnings.showwarning = old_showwarning
def get_chrom_sizes(filename):
"""Parses an illumina iGenome GenomeSize.xml file.
Returns a dictionary with chromosome names as keys
and chromosome lengths as values."""
with open(filename, "r") as genome_size_file:
return dict([
(chrom["contigname"], int(chrom["totalbases"])) for chrom in BeautifulSoup(
genome_size_file, "html5lib").sequencesizes.findAll("chromosome")])
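# Illustrative GenomeSize.xml fragment (assumed structure; html5lib
# lower-cases attribute names, hence chrom["contigname"]):
# <sequenceSizes>
#     <chromosome contigName="I" totalBases="15072434"/>
# </sequenceSizes>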
# NOTE: It is not clear that this really does what I think it does:
# https://www.biostars.org/p/161534/
# https://www.biostars.org/p/170406/
def feature_orientation2stranded(LIB_TYPE):
def feature_stranded(wildcards):
orientation = wildcards.orientation
if orientation == "fwd":
if LIB_TYPE[-2:] == "SF":
return 1
elif LIB_TYPE[-2:] == "SR":
return 2
else:
raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
elif orientation == "rev":
if LIB_TYPE[-2:] == "SF":
return 2
elif LIB_TYPE[-2:] == "SR":
return 1
else:
raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
elif orientation == "all":
return 0
        else:
            raise ValueError('Orientation must be one of "fwd", "rev" or "all".')
return feature_stranded
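# Usage sketch (illustrative): the returned function maps the "orientation"
# wildcard to the value for featureCounts' -s option, e.g. with
# feature_orientation2stranded("ISR"): "fwd" -> 2, "rev" -> 1, "all" -> 0.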
def sum_htseq_counts(counts_filename):
with open(counts_filename) as counts_file:
return sum((int(fields[1]) for fields in map(
strip_split, counts_file) if not fields[0].startswith("__")))
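# Illustrative htseq-count output (made-up values); the "__*" lines are
# htseq-count's summary categories:
# WBGene00022277	1848
# WBGene00022276	4814
# __no_feature	1871034
# __ambiguous	4994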
def read_htseq_counts(counts_filename):
return pd.read_csv(counts_filename, sep="\t", header=None, index_col=0).drop(
["__no_feature",
"__ambiguous",
"__too_low_aQual",
"__not_aligned",
"__alignment_not_unique"],
errors="ignore")
def sum_feature_counts(counts_filename, nb_bams=1):
"""Sums all counts in a featureCounts generated *counts_filename*.
*nb_bams* indicates the numbre of bam files that were given to featureCounts.
This determines which columns should be used."""
# Counts are in the 7-th column, starting from third row.
# The first sum is over the the rows, the second over the columns
return pd.read_csv(counts_filename, sep="\t", skiprows=2, usecols=range(6, 6 + nb_bams), header=None).sum().sum()
def read_feature_counts(counts_filename, nb_bams=1):
return pd.read_csv(counts_filename, sep="\t", skiprows=1, usecols=[0, *range(6, 6 + nb_bams)], index_col=0)
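# Example bedtools intersect output with a bed annotation
# (counts are in the 7th column):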
# I 3746 3909 "WBGene00023193" - . 17996
# I 4118 10230 "WBGene00022277" - . 1848
# I 10412 16842 "WBGene00022276" + . 4814
# I 17482 26781 "WBGene00022278" - . 4282
# I 22881 23600 "WBGene00235381" - . 421
# I 27594 32482 "WBGene00022279" - . 2237
# I 31522 31543 "WBGene00170953" + . 110
# I 32414 32435 "WBGene00173569" + . 87
# I 43732 44677 "WBGene00022275" + . 24
# I 47471 49819 "WBGene00044345" + . 846
def sum_intersect_counts(counts_filename):
"""Sums all counts in a bedtools intersect generated *counts_filename*, where the annotation was in bed format."""
# Counts are in the 7-th column
try:
return pd.read_csv(counts_filename, sep="\t", usecols=[6], header=None).sum().iloc[0]
except pd.errors.EmptyDataError:
return "NA"
def read_intersect_counts(counts_filename):
# index_col takes effect after column selection with usecols, hence index_col=0 (ex-third column):
# https://stackoverflow.com/a/45943627/1878788
try:
        return pd.read_csv(counts_filename, sep="\t", usecols=[3, 6], header=None, index_col=0)
    except pd.errors.EmptyDataError:
        return pd.DataFrame(index=[], columns=["gene", "counts"]).set_index("gene")
def sum_by_family(counts_data):
"""
    Adds a "family" column to *counts_data* and sums the counts for a given
    repeat family.
The family column is determined assuming that the index contains the repeat
family name suffixed with a ":" and a number (representing the particular
instance of the repeat).
"""
repeat_families = [":".join(name.split(":")[:-1]) for name in counts_data.index]
return counts_data.assign(family=repeat_families).groupby("family").sum()
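# Illustrative example (the index format "family:instance" is assumed):
# an index ["MIRAGE1:1", "MIRAGE1:2"] results in a single "MIRAGE1" row
# whose counts are summed over both instances.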
# http://stackoverflow.com/a/845069/1878788
def file_len(fname):
p = Popen(
['wc', '-l', fname],
stdout=PIPE,
stderr=PIPE)
result, err = p.communicate()
if p.returncode != 0:
raise IOError(err)
return int(result.strip().split()[0])
def last_lines(fname, nb_lines=1):
p = Popen(
["tail", f"-{nb_lines}", fname],
stdout=PIPE,
stderr=PIPE)
result, err = p.communicate()
if p.returncode != 0:
raise IOError(err)
return result.decode("utf-8")
def test_na_file(fname):
with open(fname, "r") as f:
firstline = f.readline().strip()
return firstline == "NA"
def filter_combinator(combinator, blacklist):
"""This function builds a wildcards combination generator
based on the generator *combinator* and a set of combinations
to exclude *blacklist*."""
def filtered_combinator(*args, **kwargs):
"""This function generates wildcards combinations.
It is to be used as second argument of *expand*."""
#print(*args)
for wc_comb in combinator(*args, **kwargs):
            # Use frozenset instead of tuple
            # in order to accommodate
            # unpredictable wildcard order.
if frozenset(wc_comb) not in blacklist:
yield wc_comb
return filtered_combinator
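# Hypothetical usage sketch with snakemake's expand (wildcard names and
# values are made up; blacklist entries are frozensets of
# (wildcard_name, value) pairs):
# from itertools import product
# forbidden = {frozenset({("treat", "mut"), ("rep", "2")})}
# expand(
#     OPJ("{treat}_{rep}", "counts.txt"),
#     filter_combinator(product, forbidden),
#     treat=["wt", "mut"], rep=["1", "2"])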
def column_converter(convert_dict, column_name=None):
"""This generates a function that can be used in *pandas.DataFrame.apply*.
*convert_dict* will be used to generate new names from those found in the index,
or in the column given by *column_name*."""
get = convert_dict.get
if column_name is None:
def convert_name(row):
old_name = row.name
return get(old_name, old_name)
else:
def convert_name(row):
old_name = row[column_name]
return get(old_name, old_name)
return convert_name
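# Hypothetical usage sketch (the ID-to-name mapping is made up):
# renamer = column_converter({"WBGene00022277": "some_gene_name"})
# counts_data.index = counts_data.apply(renamer, axis=1)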
def save_plot(outfile,
plot_func,
*args,
title=None, format=None,
tight=True, equal_axes=False, square=False, rasterize=False,
**kwargs):
"""*format* is needed when using multiple pages output."""
# https://stackoverflow.com/a/10154763/1878788
extra_artists = plot_func(*args, **kwargs)
if extra_artists is not None:
save_kwds = {"bbox_extra_artists": extra_artists}
else:
save_kwds = dict()
if title is not None:
usetex = mpl.rcParams.get("text.usetex", False)
if usetex:
title = texscape(title)
plt.gcf().suptitle(title)
# Doesn't work?
# if "x_range" in kwargs:
# plt.xlim(kwargs["x_range"])
# if "y_range" in kwargs:
# plt.ylim(kwargs["y_range"])
if equal_axes:
# https://stackoverflow.com/a/17996099/1878788
plt.gca().set_aspect("equal", adjustable="box")
if square:
# https://stackoverflow.com/a/18576329/1878788
plt.gca().set_aspect(1. / plt.gca().get_data_ratio(), adjustable="box")
if tight:
save_kwds["bbox_inches"] = "tight"
#plt.tight_layout()
if format is None:
plt.savefig(outfile, **save_kwds)
else:
plt.savefig(outfile, format=format, rasterize=rasterize, **save_kwds)
plt.close(plt.gcf())
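# Hypothetical usage sketch (the plotting function and data are made up):
# def plot_size_distro(size_data):
#     size_data.plot.bar()
# save_plot("size_distro.pdf", plot_size_distro, size_data,
#           title="Size distribution")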
def make_id_list_getter(gene_lists_dir, avail_id_lists=None):
"""
*gene_lists_dir* is the directory in which gene lists are located.
*avail_id_lists* can be used to restrict the set of files to use.
If not set, all files ending in "_ids.txt" will be considered.
"""
if avail_id_lists is None:
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
str_attr_err = compile("'str' object has no attribute '.*'")
def get_id_list(wildcards):
"""Instead of a "wildcards" object, a string can be used directly."""
try:
id_list = wildcards.id_list
except AttributeError as e:
# This may not be a "wildcards" object;
# Try to use it as a string
if str_attr_err.match(str(e)) is not None:
id_list = wildcards
else:
raise
# This may be directly a path
if id_list in avail_id_lists:
with open(id_list, "r") as infile:
return [strip_split(line)[0] for line in infile.readlines() if line[0] != "#"]
elif id_list == "lfc_statuses":
return None
else:
list_filename = OPJ(gene_lists_dir, f"{id_list}_ids.txt")
if list_filename in avail_id_lists:
with open(list_filename, "r") as infile:
return [strip_split(line)[0] for line in infile.readlines() if line[0] != "#"]
else:
raise NotImplementedError(f"{id_list} unknown.\n")
return get_id_list
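# Hypothetical usage sketch (directory and list name are assumptions):
# get_id_list = make_id_list_getter("Gene_lists")
# ids = get_id_list("germline_specific")
# # -> reads Gene_lists/germline_specific_ids.txt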
from setuptools import setup, find_packages
#from Cython.Build import cythonize
#import libworkflows
# Adapted from Biopython
__version__ = "Undefined"
for line in open('libworkflows/__init__.py'):
    if line.startswith('__version__'):
exec(line.strip())
setup(
name="libworkflows",
#version=libworkflows.__version__,
version=__version__,
description="Miscellaneous things to build workflows for high throughput sequencing data.",
author="Blaise Li",
author_email="blaise.li@normalesup.org",
license="MIT",
packages=find_packages())
#ext_modules = cythonize("libsmallrna/libsmallrna.pyx"),
#install_requires=["cytoolz"],
#zip_safe=False