import os
import sys
import warnings
from contextlib import contextmanager
from glob import glob
from re import compile, sub
from shutil import rmtree
from subprocess import Popen, PIPE

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from snakemake import shell
from snakemake.io import apply_wildcards

# To parse genome size information
from bs4 import BeautifulSoup


OPJ = os.path.join


def formatwarning(message, category, filename, lineno, line=None):
    """Used to format warning messages."""
    return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)


warnings.formatwarning = formatwarning


# To be loaded in snakemake as follows:
# from libworkflows import SHELL_FUNCTIONS
# shell.prefix(SHELL_FUNCTIONS)
SHELL_FUNCTIONS = """
# http://linuxcommand.org/wss0150.php
PROGNAME=$(basename $0)

function error_exit
{{
#	----------------------------------------------------------------
#	Function for exit due to fatal program error
#		Accepts 1 argument:
#			string containing descriptive error message
#	----------------------------------------------------------------
    echo "${{PROGNAME}}: ${{1:-"Unknown Error"}}" 1>&2
    exit 1
}}

count_fastq_reads()
{{
    # $1: file in which to write the number of fastq records
    wc -l | {{ read nblines; echo ${{nblines}} / 4 | bc > ${{1}}; }}
}}

sort_by_seq()
{{
    fastq-sort -s || error_exit "fastq-sort failed"
}}

dedup()
{{
    sort_by_seq | remove-duplicates-from-sorted-fastq || error_exit "remove-duplicates-from-sorted-fastq failed"
}}

trim_random_nt()
{{
    # $1: nb of bases to trim at 5' end
    # $2: nb of bases to trim at 3' end
    cutadapt -u ${{1}} -u -${{2}} - 2> /dev/null || error_exit "trim_random_nt failed"
}}

compute_size_distribution()
{{
    # $1: file in which to write the size distribution
    awk 'NR%4==2 {{histo[length($0)]++}} END {{for (l in histo) print l"\\t"histo[l]}}' > ${{1}}
}}

samstats2mapped()
{{
    # $1: file containing the output of samtools stats
    # $2: file in which to put the number of mappers
    grep "^SN" ${{1}} | cut -f 2- | grep "^reads mapped:" | cut -f 2 > ${{2}}
}}
"""


strip = str.strip
split = str.split


def strip_split(text):
    return split(strip(text), "\t")


# import inspect
def texscape(text):
    """Escapes underscores to make a latex-compatible text."""
94
95
96
97
98
99
100
101
    # https://stackoverflow.com/a/2654130/1878788
    # currframe = inspect.currentframe()
    # callframe = inspect.getouterframes(currframe)
    # print(f"Escaping {text}\n(Called by {callframe})")
    # print(f"Escaping {text}")
    # return sub("_", r"\_", text)
    # To avoid double escape:
    return sub(r"([^\\])_", r"\1\_", text)
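

# Example behaviour (a sketch):
# texscape("wild_type")    -> "wild\_type"
# texscape(r"wild\_type")  -> "wild\_type" (already-escaped underscores are kept)
# Note that an underscore at the very start of the string has no preceding
# character to match [^\\], so it is left unescaped.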


def ensure_relative(path, basedir):
    """Returns the relative path to *path* from *basedir*.
    That way, a snakefile can be included using its absolute path."""
    if os.path.isabs(path):
        return os.path.relpath(path, basedir)
    else:
        return path
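

# For instance (hypothetical paths):
# ensure_relative("/home/user/project/rules.smk", "/home/user/project") -> "rules.smk"
# ensure_relative("rules.smk", "/home/user/project") -> "rules.smk" (unchanged)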


def wc_applied(source_function):
    """
    This can be used as a decorator for rule input sourcing functions.

    This ensures that results returned in the form of explicit output from a
    rule have the wildcard substitution applied.

    See <https://bitbucket.org/snakemake/snakemake/issues/956/placeholders-not-properly-matched-with>.
    """
    def wc_applied_source_func(wildcards):
        filename_templates = source_function(wildcards)
        if not isinstance(filename_templates, (str, bytes)):
            return [apply_wildcards(filename_template, wildcards) for filename_template in filename_templates]
        else:
            return apply_wildcards(filename_templates, wildcards)
    return wc_applied_source_func
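

# Usage sketch in a Snakefile (rule and wildcard names are hypothetical):
#
# @wc_applied
# def source_sorted_bam(wildcards):
#     return rules.sort_bam.output.bam
#
# rule compute_coverage:
#     input:
#         bam=source_sorted_bam,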


def cleanup_and_backup(output_dir, config, delete=False):
    """Performs cleanup and backup according to the information present in the
    *config* dictionary."""
    print("removing metadata")
    # https://stackoverflow.com/a/45614282/1878788
    rmtree(".snakemake/metadata")
    if config.get("backup", {}):
        user = config["backup"]["user"]
        if user == "USER":
            user = os.environ[user]
        host = config["backup"]["host"]
        # If no dest_dir, we assume the directory structure
        # is the same on the destination host
        dest_dir = config["backup"].get("dest_dir", os.getcwd())
        if delete:
            rsync_options = "-vaP --delete"
        else:
            rsync_options = "-vaP"
        try:
            print(f"backing up results to {user}@{host}:{dest_dir}")
            shell(f"rsync {rsync_options} {output_dir} {user}@{host}:{dest_dir}")
        # TODO: find the explicit and correct exception
        except Exception:
            print(f"backing up results to {user}@myriad:{dest_dir}")
            shell(f"rsync {rsync_options} {output_dir} {user}@myriad:{dest_dir}")


def read_int_from_file(filename):
    """Just reads a single integer from a file."""
    with open(filename, "r") as f:
        return int(f.readline().strip())


def read_float_from_file(filename):
    """Just reads a single float from a file."""
    with open(filename, "r") as f:
        return float(f.readline().strip())


@contextmanager
def warn_context(warn_filename):
    """Yields a function that writes warnings in file *warn_filename*
    (and also to stderr)."""
    with open(warn_filename, "w") as warn_file:
        def warn_to_file(message, category, filename, lineno, file=None, line=None):
            formatted = warnings.formatwarning(message, category, filename, lineno, line)
            # echo to stderr
            print(formatted, file=sys.stderr)
            warn_file.write(formatted)
        warnings.showwarning = warn_to_file
        yield warnings.warn
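

# Usage sketch (hypothetical file name and message):
#
# with warn_context("warnings.log") as warn:
#     warn("Low mapping rate.")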


def get_chrom_sizes(filename):
    """Parses an illumina iGenome GenomeSize.xml file.
    Returns a dictionary with chromosome names as keys
    and chromosome lengths as values."""
    with open(filename, "r") as genome_size_file:
        return {
            chrom["contigname"]: int(chrom["totalbases"])
            for chrom in BeautifulSoup(
                genome_size_file, "html5lib").sequencesizes.findAll("chromosome")}
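

# The GenomeSize.xml file is expected to look roughly like this (a sketch;
# html5lib lowercases attribute names during parsing, hence "contigname"
# and "totalbases" above):
#
# <sequenceSizes>
#     <chromosome fileName="..." contigName="I" totalBases="15072434"/>
#     ...
# </sequenceSizes>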


# NOTE: It is not clear that this really does what I think it does:
# https://www.biostars.org/p/161534/
# https://www.biostars.org/p/170406/
def feature_orientation2stranded(LIB_TYPE):
    def feature_stranded(wildcards):
        orientation = wildcards.orientation
        if orientation == "fwd":
            if LIB_TYPE[-2:] == "SF":
                return 1
            elif LIB_TYPE[-2:] == "SR":
                return 2
            else:
                raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
        elif orientation == "rev":
            if LIB_TYPE[-2:] == "SF":
                return 2
            elif LIB_TYPE[-2:] == "SR":
                return 1
            else:
                raise ValueError(f"{LIB_TYPE} library type not compatible with strand-aware read counting.")
        elif orientation == "all":
            return 0
        else:
            raise ValueError('Orientation must be among "fwd", "rev" and "all".')
    return feature_stranded
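

# Usage sketch in a Snakefile (assuming the returned 0 / 1 / 2 values are
# meant for featureCounts' -s option; rule and variable names are hypothetical):
#
# get_stranded = feature_orientation2stranded(LIB_TYPE)
# rule count_reads_in_features:
#     params:
#         stranded=get_stranded,
#     shell:
#         """
#         featureCounts -s {params.stranded} ...
#         """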


def sum_htseq_counts(counts_filename):
    """Sums the counts in an htseq-count output file *counts_filename*,
    ignoring the special "__*" counters."""
    with open(counts_filename) as counts_file:
        return sum((int(fields[1]) for fields in map(
            strip_split, counts_file) if not fields[0].startswith("__")))


def read_htseq_counts(counts_filename):
    """Loads the counts from an htseq-count output file *counts_filename*
    as a DataFrame, dropping the special "__*" counters."""
    return pd.read_table(counts_filename, header=None, index_col=0).drop(
        ["__no_feature",
         "__ambiguous",
         "__too_low_aQual",
         "__not_aligned",
         "__alignment_not_unique"],
        errors="ignore")


def sum_feature_counts(counts_filename, nb_bams=1):
    """Sums all counts in a featureCounts generated *counts_filename*.
    *nb_bams* indicates the number of bam files that were given to featureCounts.
    This determines which columns should be used."""
    # Counts start in the 7th column, from the third row.
    # The first sum is over the rows, the second over the columns.
    return pd.read_table(
        counts_filename, skiprows=2,
        usecols=range(6, 6 + nb_bams), header=None).sum().sum()


def read_feature_counts(counts_filename, nb_bams=1):
    """Loads a featureCounts generated *counts_filename* as a DataFrame
    indexed by feature ID."""
    return pd.read_table(counts_filename, skiprows=1, usecols=[0, *range(6, 6 + nb_bams)], index_col=0)


# I	3746	3909	"WBGene00023193"	-	.	17996
# I	4118	10230	"WBGene00022277"	-	.	1848
# I	10412	16842	"WBGene00022276"	+	.	4814
# I	17482	26781	"WBGene00022278"	-	.	4282
# I	22881	23600	"WBGene00235381"	-	.	421
# I	27594	32482	"WBGene00022279"	-	.	2237
# I	31522	31543	"WBGene00170953"	+	.	110
# I	32414	32435	"WBGene00173569"	+	.	87
# I	43732	44677	"WBGene00022275"	+	.	24
# I	47471	49819	"WBGene00044345"	+	.	846
def sum_intersect_counts(counts_filename):
    """Sums all counts in a bedtools intersect generated *counts_filename*, where the annotation was in bed format."""
    # Counts are in the 7th column
    try:
        return pd.read_table(counts_filename, usecols=[6], header=None).sum().iloc[0]
    except pd.errors.EmptyDataError:
        return "NA"


def read_intersect_counts(counts_filename):
    # index_col takes effect after column selection with usecols, hence index_col=0 (ex-third column):
    # https://stackoverflow.com/a/45943627/1878788
    try:
        return pd.read_table(counts_filename, usecols=[3, 6], header=None, index_col=0)
    except pd.errors.EmptyDataError:
        return pd.DataFrame(index=[], columns=["gene", "counts"]).set_index("gene")


# http://stackoverflow.com/a/845069/1878788
def file_len(fname):
    p = Popen(
        ['wc', '-l', fname],
        stdout=PIPE,
        stderr=PIPE)
    result, err = p.communicate()
    if p.returncode != 0:
        raise IOError(err)
    return int(result.strip().split()[0])


def last_lines(fname, nb_lines=1):
    p = Popen(
        ["tail", f"-{nb_lines}", fname],
        stdout=PIPE,
        stderr=PIPE)
    result, err = p.communicate()
    if p.returncode != 0:
        raise IOError(err)
    return result.decode("utf-8")


def test_na_file(fname):
    with open(fname, "r") as f:
        firstline = f.readline().strip()
    return firstline == "NA"


def filter_combinator(combinator, blacklist):
    """This function builds a wildcards combination generator
    based on the generator *combinator* and a set of combinations
    to exclude *blacklist*."""
    def filtered_combinator(*args, **kwargs):
        """This function generates wildcards combinations.
        It is to be used as second argument of *expand*."""
        #print(*args)
        for wc_comb in combinator(*args, **kwargs):
            # Use frozenset instead of tuple
            # in order to accommodate
            # unpredictable wildcard order
            if frozenset(wc_comb) not in blacklist:
                yield wc_comb
    return filtered_combinator
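

# Usage sketch (hypothetical wildcard names and file pattern):
#
# from itertools import product
# forbidden = {frozenset({("treatment", "mut"), ("replicate", "3")})}
# filtered_product = filter_combinator(product, forbidden)
# counts_files = expand(
#     OPJ("results", "{treatment}_rep{replicate}_counts.txt"),
#     filtered_product,
#     treatment=["wt", "mut"],
#     replicate=["1", "2", "3"])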


def column_converter(convert_dict, column_name=None):
    """This generates a function that can be used in *pandas.DataFrame.apply*.
    *convert_dict* will be used to generate new names from those found in the index,
    or in the column given by *column_name*."""
    get = convert_dict.get
    if column_name is None:
        def convert_name(row):
            old_name = row.name
            return get(old_name, old_name)
    else:
        def convert_name(row):
            old_name = row[column_name]
            return get(old_name, old_name)
    return convert_name
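

# Usage sketch (hypothetical identifier mapping and DataFrame):
#
# wb2name = {"WBGene00022277": "homt-1"}
# counts_data.index = counts_data.apply(column_converter(wb2name), axis=1)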


def save_plot(outfile,
              plot_func,
              *args,
              title=None, format=None,
              tight=True, equal_axes=False, square=False, rasterize=False,
              **kwargs):
    """*format* is needed when using multiple pages output."""
    # https://stackoverflow.com/a/10154763/1878788
    extra_artists = plot_func(*args, **kwargs)
    if extra_artists is not None:
        save_kwds = {"bbox_extra_artists": extra_artists}
    else:
        save_kwds = dict()
    if title is not None:
        usetex = mpl.rcParams.get("text.usetex", False)
        if usetex:
            title = texscape(title)
        plt.gcf().suptitle(title)
    # Doesn't work?
    # if "x_range" in kwargs:
    #     plt.xlim(kwargs["x_range"])
    # if "y_range" in kwargs:
    #     plt.ylim(kwargs["y_range"])
    if equal_axes:
        # https://stackoverflow.com/a/17996099/1878788
        plt.gca().set_aspect("equal", adjustable="box")
    if square:
        # https://stackoverflow.com/a/18576329/1878788
        plt.gca().set_aspect(1. / plt.gca().get_data_ratio(), adjustable="box")
    if tight:
        save_kwds["bbox_inches"] = "tight"
        #plt.tight_layout()
    if format is None:
        plt.savefig(outfile, **save_kwds)
    else:
        plt.savefig(outfile, format=format, rasterize=rasterize, **save_kwds)
    plt.close(plt.gcf())
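

# Usage sketch (hypothetical plotting function and data):
#
# def plot_histo(data):
#     data.plot.hist()
#
# save_plot("size_distribution.pdf", plot_histo, data,
#           title="Size distribution", square=True)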


def make_id_list_getter(gene_lists_dir, avail_id_lists=None):
    """Returns a function that gets a list of IDs given a name or a path.
    ID lists are looked up in *gene_lists_dir*, in files named "<name>_ids.txt"."""
    if avail_id_lists is None:
        avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
    str_attr_err = compile("'str' object has no attribute '.*'")

    def get_id_list(wildcards):
        """Instead of a "wildcards" object, a string can be used directly."""
        try:
            id_list = wildcards.id_list
        except AttributeError as e:
            # This may not be a "wildcards" object;
            # try to use it as a string.
            if str_attr_err.match(str(e)) is not None:
                id_list = wildcards
            else:
                raise
        # This may be directly a path
        if id_list in avail_id_lists:
            with open(id_list, "r") as infile:
                return [
                    strip_split(line)[0]
                    for line in infile.readlines()
                    if line[0] != "#"]
        elif id_list == "lfc_statuses":
            return None
        else:
            list_filename = OPJ(gene_lists_dir, f"{id_list}_ids.txt")
            if list_filename in avail_id_lists:
                with open(list_filename, "r") as infile:
                    return [
                        strip_split(line)[0]
                        for line in infile.readlines()
                        if line[0] != "#"]
            else:
                raise NotImplementedError(f"{id_list} unknown.\n")
    return get_id_list
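

# Usage sketch (hypothetical directory and list name):
#
# get_id_list = make_id_list_getter("/path/to/gene_lists")
# # Reads /path/to/gene_lists/germline_genes_ids.txt
# gene_ids = get_id_list("germline_genes")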