utils.py 48.7 KB
Newer Older
1
2
3
#!/usr/bin/env python3
# coding: utf-8

4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# ###############################################################################
# This file is part of PanACOTA.                                                #
#                                                                               #
# Authors: Amandine Perrin                                                      #
# Copyright © 2018-2020 Institut Pasteur (Paris).                               #
# See the COPYRIGHT file for details.                                           #
#                                                                               #
# PanACOTA is a software providing tools for large scale bacterial comparative  #
# genomics. From a set of complete and/or draft genomes, you can:               #
#    -  Do a quality control of your strains, to eliminate poor quality         #
# genomes, which would not give any information for the comparative study       #
#    -  Uniformly annotate all genomes                                          #
#    -  Do a Pan-genome                                                         #
#    -  Do a Core or Persistent genome                                          #
#    -  Align all Core/Persistent families                                      #
#    -  Infer a phylogenetic tree from the Core/Persistent families             #
#                                                                               #
# PanACOTA is free software: you can redistribute it and/or modify it under the #
# terms of the Affero GNU General Public License as published by the Free       #
# Software Foundation, either version 3 of the License, or (at your option)     #
# any later version.                                                            #
#                                                                               #
# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY   #
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS     #
# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License           #
# for more details.                                                             #
#                                                                               #
# You should have received a copy of the Affero GNU General Public License      #
# along with PanACOTA (COPYING file).                                           #
# If not, see <https://www.gnu.org/licenses/>.                                  #
# ###############################################################################

36
37
38
39
40
41
42
43
"""
Util functions and classes.


@author gem
April 2017
"""

44
45
import os
import sys
46
import re
47
import glob
48
import subprocess
49
import shutil
50
import shlex
51
import progressbar
52
53
54
55
56

# Logging
import logging
from logging.handlers import RotatingFileHandler
from colorlog import ColoredFormatter
57

58
59
60
61
62
try:
    import cPickle as pickle
except:
    try:
        import _pickle as pickle
63
    except:  # pragma: no cover
64
        import pickle
65
66


Amandine  PERRIN's avatar
Amandine PERRIN committed
67
def init_logger(logfile_base, level, name, log_details=False, verbose=0, quiet=False):
Amandine  PERRIN's avatar
Amandine PERRIN committed
68
69
70
    """
    Create logger and its handlers, and set them to the given level

71
    level hierarchy: ``CRITICAL > ERROR > WARNING > INFO > DETAILS > DEBUG``
Amandine  PERRIN's avatar
Amandine PERRIN committed
72
73

    Messages from all levels are written in 'logfile'.log
74

Amandine  PERRIN's avatar
Amandine PERRIN committed
75
    Messages for levels less than WARNING (only INFO and DEBUG) written to stdout
76

77
    Messages for levels equal or higher than WARNING written to stderr
78

Amandine  PERRIN's avatar
Amandine PERRIN committed
79
80
    Messages for levels equal or higher than WARNING written in `logfile`.log.err

81
82
83
84
85
86
87
88
89
90

    Parameters
    ----------
    logfile_base : str
        base of filename to use for logs. Will add '.log', '.log.details' and '.log.err' for\
        the 3 log files created
    level : int
        minimum level that must be considered.
    name : str or None
        if we need to name the logger (used for tests)
Amandine  PERRIN's avatar
Amandine PERRIN committed
91
    log_details : bool
92
93
94
        if True, force creation of .log.details file. Otherwise, just create
        it if needed according to level

95
    verbose : int
Amandine  PERRIN's avatar
Amandine PERRIN committed
96
        be more verbose:
Amandine  PERRIN's avatar
Amandine PERRIN committed
97
98
        default (0): info in stdout, error and more in stderr ;
        info and more in *.log ; warning and more in *.log.err
Amandine  PERRIN's avatar
Amandine PERRIN committed
99
        1 = add warnings in stderr
Amandine  PERRIN's avatar
Amandine PERRIN committed
100
101
        2 = like 1 + add details to stdout (by default only INFO) + add details to *.log.details
        >15: add debug to stdout and create *.log.debug with all levels
102
103
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise
Amandine  PERRIN's avatar
Amandine PERRIN committed
104
    """
105
    import time
Amandine  PERRIN's avatar
Amandine PERRIN committed
106
107

    # time when soft is launched
108
    time_start = time.strftime("_%y-%m-%d_%H-%m-%S")
Amandine  PERRIN's avatar
Amandine PERRIN committed
109

Amandine  PERRIN's avatar
Amandine PERRIN committed
110
    # create logger
111
112
    logger = logging.getLogger(name)

113
114
115
116
117
118
119
120
121
122
    # Determine logfile names
    logfile = logfile_base + ".log"
    if os.path.isfile(logfile):
        logfile = logfile_base + "-" + time_start + ".log"
    errfile = logfile_base + ".log.err"
    if os.path.isfile(errfile):
        errfile = logfile_base + "-" + time_start + ".log.err"
    detailfile = logfile_base + ".log.details"
    if os.path.isfile(detailfile):
        detailfile = logfile_base + "-" + time_start + ".log.details"
Amandine  PERRIN's avatar
Amandine PERRIN committed
123
124
125
    debugfile = logfile_base + ".log.debug"
    if os.path.isfile(debugfile):
        debugfile = logfile_base + "-" + time_start + ".log.debug"
126
127
128
129
130

    # Create a new logging level: details (between info and debug)
    # Used to add details to the log file, but not to stdout, while still having
    # the possibility to put debug messages, used only for development.
    def details(self, message, *args, **kws):
131
132
133
        """
        Define a new log level: details
        """
134
135
        if self.isEnabledFor(logging.DETAIL):
            self._log(logging.DETAIL, message, args, **kws)
Amandine  PERRIN's avatar
Amandine PERRIN committed
136
137
    logging.addLevelName(detail_lvl(), "DETAIL")
    logging.DETAIL = detail_lvl()
138
139
    logging.Logger.details = details

Amandine  PERRIN's avatar
Amandine PERRIN committed
140
    # set level of logger
Amandine  PERRIN's avatar
Amandine PERRIN committed
141
    logger.setLevel(logging.DEBUG)
142

Amandine  PERRIN's avatar
Amandine PERRIN committed
143
144
145
    # create formatter for log messages:
    # "timestamp :: level :: message"
    # (add :: %(name)s  to add the logger name)
146
    my_format = '[%(asctime)s] :: %(levelname)s :: %(message)s'
Amandine  PERRIN's avatar
Amandine PERRIN committed
147
    formatter_file = logging.Formatter(my_format, '%Y-%m-%d %H:%M:%S')
148
149
150
151
152
153
154
155
156
157
158
    my_format_stream = '%(log_color)s  * [%(asctime)s] : %(levelname)s %(reset)s %(message)s'
    formatter_stream = ColoredFormatter(my_format_stream, datefmt='%Y-%m-%d %H:%M:%S',
                                        log_colors={'DEBUG' : 'cyan',
                                                    'INFO' : 'green',
                                                    'DETAIL' : 'cyan',
                                                    'WARNING' : 'yellow',
                                                    'ERROR' : 'red',
                                                    'CRITICAL' : 'red',
                                                    })


Amandine  PERRIN's avatar
Amandine PERRIN committed
159
160
161
162
163

    # Create handler 1: writing to 'logfile'. mode 'write', max size = 1Mo.
    # If logfile is 1Mo, it is renamed to logfile.1, and next logs are still
    # written to logfile. Then, logfile.1 is renamed to logfile.2, logfile to
    # logfile.1 etc. We allow maximum 5 log files.
164
    # logfile contains everything from INFO level (INFO, WARNING, ERROR)
165
    logfile_handler = RotatingFileHandler(logfile, 'w', 10000000, 5)
Amandine  PERRIN's avatar
Amandine PERRIN committed
166
    # set level to the same as the logger level
167
    logfile_handler.setLevel(logging.INFO)
168
    logfile_handler.setFormatter(formatter_file)  # add formatter
Amandine  PERRIN's avatar
Amandine PERRIN committed
169
170
    logger.addHandler(logfile_handler)  # add handler to logger

171
    # Create handler 2: errfile. Write only warnings and errors
172
    errfile_handler = RotatingFileHandler(errfile, 'w', 10000000, 5)
Amandine  PERRIN's avatar
Amandine PERRIN committed
173
    errfile_handler.setLevel(logging.WARNING)
174
    errfile_handler.setFormatter(formatter_file)  # add formatter
Amandine  PERRIN's avatar
Amandine PERRIN committed
175
176
    logger.addHandler(errfile_handler)  # add handler to logger

Amandine  PERRIN's avatar
Amandine PERRIN committed
177

Amandine  PERRIN's avatar
Amandine PERRIN committed
178
    # Create handler 3: detailsfile. Write everything to this file, except debug
179
180
181
182
183
    # Create it only if:
    # - level is <= info (for modules which have no details, so detailsfile is the same as
    # logfile)
    # - details==True force creation of detailsfile
    # - quiet==True nothing in stdout, put all log files so that user can check
Amandine  PERRIN's avatar
Amandine PERRIN committed
184
    if level < logging.INFO or quiet or log_details:
185
        detfile_handler = RotatingFileHandler(detailfile, 'w', 10000000, 5)
Amandine  PERRIN's avatar
Amandine PERRIN committed
186
        detfile_handler.setLevel(logging.DETAIL)
187
        detfile_handler.setFormatter(formatter_file)  # add formatter
Amandine  PERRIN's avatar
Amandine PERRIN committed
188
        logger.addHandler(detfile_handler)  # add handler to logger
189

190
191
192
193
    # Formats for detailed log files
    my_format_debug = '[%(asctime)s] :: %(levelname)s (from %(name)s logger) :: %(message)s'
    formatter_file_debug = logging.Formatter(my_format_debug, '%Y-%m-%d %H:%M:%S')

Amandine  PERRIN's avatar
Amandine PERRIN committed
194
    # Create handler 4: debug file. Write everything
Amandine  PERRIN's avatar
Amandine PERRIN committed
195
196
197
    if level < logging.DETAIL:
        debugfile_handler = RotatingFileHandler(debugfile, 'w', 10000000, 5)
        debugfile_handler.setLevel(logging.DEBUG)
198
        debugfile_handler.setFormatter(formatter_file_debug)  # add formatter
Amandine  PERRIN's avatar
Amandine PERRIN committed
199
200
        logger.addHandler(debugfile_handler)  # add handler to logger

201
    # If not quiet, add handlers for stdout and stderr
202
    if not quiet:
203
        # Create handler 4: write to stdout
204
        stream_handler = logging.StreamHandler(sys.stdout)
Amandine  PERRIN's avatar
Amandine PERRIN committed
205
206
207
        # By default, write everything
        stream_handler.setLevel(logging.DEBUG)
        # BUT: don't write messages >= WARNING (warning, error, critical)
208
        stream_handler.addFilter(LessThanFilter(logging.WARNING))
Amandine  PERRIN's avatar
Amandine PERRIN committed
209
        # if not verbose (level 0 or 1): only put info in stdout (remove details and debug)
210
211
        if verbose < 2:
            stream_handler.addFilter(NoLevelFilter(logging.DETAIL))
Amandine  PERRIN's avatar
Amandine PERRIN committed
212
            stream_handler.addFilter(NoLevelFilter(logging.DEBUG))
213
        stream_handler.setFormatter(formatter_stream)
214
215
        logger.addHandler(stream_handler)  # add handler to logger

216
        # Create handler 5: write to stderr
217
        err_handler = logging.StreamHandler(sys.stderr)
Amandine  PERRIN's avatar
Amandine PERRIN committed
218

219
        if verbose > 0:
220
221
222
            err_handler.setLevel(logging.WARNING)  # write all messages >= WARNING
        else:
            err_handler.setLevel(logging.ERROR)  # write all messages >= ERROR
223
        err_handler.setFormatter(formatter_stream)
224
        logger.addHandler(err_handler)  # add handler to logger
Amandine  PERRIN's avatar
Amandine PERRIN committed
225
    logger.propagate = False
Amandine  PERRIN's avatar
Amandine PERRIN committed
226
    return logfile, logger
Amandine  PERRIN's avatar
Amandine PERRIN committed
227
228


229
230
231
232
233
234
235
class LessThanFilter(logging.Filter):
    """
    When using log, when a level is set to a handler, it is a minimum level. All
    levels higher than it will be printed. If you want to print only until
    a given level (no levels higher than the specified one), use this class like this:
    handler.addFilter(LessThanFilter(level))
    """
236

237
238
239
240
241
    def __init__(self, level):
        self._level = level
        logging.Filter.__init__(self)

    def filter(self, rec):
242
243
244
245
246
247
248
249
250
251
252
253
        """
        Function to decide if given log has to be logged or not, according to its level

        Parameters
        ----------
        rec : current record handled by logger

        Returns
        -------
        bool
            True if level of current log is less than the defined limit, False otherwise
        """
254
255
        return rec.levelno < self._level

256

257
258
259
260
261
262
263
class NoLevelFilter(logging.Filter):
    """
    When using log, specify a given level that must not be taken into account by the handler.
    This is used for the stdout handler. We want to print, by default,
    DEBUG (for development use) and INFO levels, but not DETAILS level (which is between
    DEBUG and INFO). We want to print DETAIL only if verbose option was set
    """
264

265
266
267
268
269
    def __init__(self, level):
        self._level = level
        logging.Filter.__init__(self)

    def filter(self, rec):
270
271
272
273
274
275
276
277
278
279
280
281
        """
        Function to decide if given log has to be logged or not, according to its level

        Parameters
        ----------
        rec : current record handled by logger

        Returns
        -------
        bool
            True if level of current log is different from forbidden level, False if it is the same
        """
282
283
284
        return rec.levelno != self._level


285
286
def check_installed(cmd):
    """
287
288
    Check if the command 'cmd' is in $PATH and can then be executed

289
290
291
292
293
294
295
296
297
    Parameters
    ----------
    cmd : str
        command to run

    Returns
    -------
    bool
        True if installed, False otherwise
298
    """
299
    torun = "which " + cmd
300
    trying = subprocess.Popen(shlex.split(torun), stdout=subprocess.PIPE)
301
302
303
304
305
    out, _ = trying.communicate()
    if trying.returncode == 0:
        if os.path.isfile(out.strip()):
            return True
    return False
306

307

Amandine  PERRIN's avatar
Amandine PERRIN committed
308
309
310
311
def run_cmd(cmd, error, eof=False, **kwargs):
    """
    Run the given command line. If the return code is not 0, print error message.
    if eof (exit on fail) is True, exit program if error code is not 0.
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328

    Parameters
    ----------
    cmd : str
        command to run
    error : str
        error message to print if error while running command
    eof : bool
        True: exit program if command failed, False: do not exit even if command fails
    kwargs : Object
        Can provide a logger, stdout and/or stderr streams

    Returns
    -------
    subprocess.Popen
        returns object of subprocess call (has attributes returncode, pid, communicate etc.)

Amandine  PERRIN's avatar
Amandine PERRIN committed
329
    """
330
    if "logger" not in kwargs:
331
        logger = logging.getLogger("utils.run_cmd")
332
333
    else:
        logger = kwargs["logger"]
334
    if "stdout" not in kwargs:
Amandine  PERRIN's avatar
Amandine PERRIN committed
335
        kwargs["stdout"] = None
336
    if "stderr" not in kwargs:
Amandine  PERRIN's avatar
Amandine PERRIN committed
337
        kwargs["stderr"] = None
338
    try:
339
        call = subprocess.Popen(shlex.split(cmd), stdout=kwargs["stdout"],
340
                                stderr=kwargs["stderr"])
341
342
        call.wait()
        retcode = call.returncode
343
    except OSError:
344
        logger.error(f"error: command '>{cmd}' is not possible.")
345
        if eof:
Amandine  PERRIN's avatar
Amandine PERRIN committed
346
            sys.exit(1)
347
        else:
Amandine  PERRIN's avatar
Amandine PERRIN committed
348
            return 1
Amandine  PERRIN's avatar
Amandine PERRIN committed
349
350
351
352
    if retcode != 0:
        logger.error(error)
        if eof:
            sys.exit(retcode)
353
    return call
Amandine  PERRIN's avatar
Amandine PERRIN committed
354
355


356
def plot_distr(values, limit, title, text, logger):
357
358
    """
    Plot histogram of given 'values', and add a vertical line corresponding to the chosen
359
    'limit' and return the mpl figure
360
361
362
363
364
365
366
367
368
369
370

    Parameters
    ----------
    values : list
        list of values
    limit : int
        limit for which a vertical line must be drawn
    title : str
        Title to give to plot
    text : str
        text to write near the vertical line representing the limit
371
372
    logger : logging.Logger
        logger object to write log information
373
374
375
376
377

    Returns
    -------
    matplotlib.figure.Figure
        figure generated
378
    """
379
380
    import math
    import numpy as np
381
382
383
    import matplotlib
    matplotlib.use('AGG')
    from matplotlib import pyplot as plt
384
    plt.close("all")
385
386
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(1, 1, 1)
387
    max_x = max(values)
388
389
390
    # if too many values, group them to have less bins in the histogram.
    # Put 'group_values' values in each bin ->
    # if less than 300 values, 1 value per bin, otherwise more values per bin
391
    group_values = int(max_x / 300) + 1
392
393
    dec_ax = math.exp(0.001 * max_x) - 1
    dec_text = 3 * dec_ax
394
395
396
397
398
399
    bins = np.arange(0, max_x + 2 * group_values, group_values) - 0.5
    ax.hist(values, bins=bins, edgecolor="black", color="blue")
    ax.set_xlim(0.5, max_x + 0.5 * group_values)
    ax.axvline(x=limit + 0.5 * group_values + dec_ax, color="r")
    ax.text(x=limit + 0.5 * group_values + dec_text, y=plt.ylim()[1] / 2,
            s=text + " " + str(limit), color="r", rotation=90)
400
401
    ax.set_title(title)
    return fig
402
403


404
def write_warning_skipped(skipped, do_format=False, prodigal_only=False, logfile=""):
405
406
407
408
    """
    At the end of the script, write a warning to the user with the names of the genomes
    which had problems with prokka.

409
410
411
412
413
414
415
    Parameters
    ----------
    skipped : list
        list of genomes with problems
    do_format : bool
        if False, genomes were not skipped because of format step, but before that.\
        if True, they were skipped because of format
416
417
418
    prodigal_only : bool
        if False: used prokka to annotate
        if True: used prodigal to annotate
419
    """
420
    if not prodigal_only:
Amandine  PERRIN's avatar
Amandine PERRIN committed
421
        soft = "prokka"
422
    else:
Amandine  PERRIN's avatar
Amandine PERRIN committed
423
        soft = "prodigal"
424
    logger = logging.getLogger("utils")
425
    list_to_write = "\n".join(["\t- " + genome for genome in skipped])
426
    if not do_format:
Amandine  PERRIN's avatar
Amandine PERRIN committed
427
428
        logger.info(f"WARNING: Some genomes could not be annotated. See {soft}")
        logger.warning(f"{soft} had problems while annotating some genomes, or "
Amandine  PERRIN's avatar
Amandine PERRIN committed
429
430
431
432
433
                    "did not find any gene. Hence, they are not formatted, and absent "
                    "from your output database. Please look at the "
                    "current error log "
                    "(<output_directory>/PanACoTA-annotate_list_genomes[-date].log.err) to get more "
                    "information on the problems. Here are those "
Amandine  PERRIN's avatar
Amandine PERRIN committed
434
                    f"genomes:\n{list_to_write}")
435
    else:
Amandine  PERRIN's avatar
Amandine PERRIN committed
436
        logger.info(f"WARNING: Some genomes could not be formatted. See {logfile}")
Amandine  PERRIN's avatar
Amandine PERRIN committed
437
        logger.warning((f"Some genomes were annotated by {soft}, but could not be formatted, "
438
                        "and are hence absent from your output database. Please look at "
Amandine  PERRIN's avatar
Amandine PERRIN committed
439
440
                        "'<output_directory>/PanACoTA-annotate_list_genomes[-date].log.err' and "
                        ".details files to get more information about why they could not be "
Amandine  PERRIN's avatar
Amandine PERRIN committed
441
                        f"formatted.\n{list_to_write}"))
442
443


444
def write_genomes_info(genomes, kept_genomes, list_file, res_path, qc=False):
445
    """
446
447
448
449
450
451
452
    Write the list of genomes discarded to a file (qc=False), so that users can
    keep a trace of them, with their information (nb contigs, L90 etc.)

    If qc=True, we stop after QC.
    -> Write the list of genomes that would be kept for annotation with all
    their information (L90, size, #contig)

453

454
455
456
    Parameters
    ----------
    genomes : dict
Amandine  PERRIN's avatar
Amandine PERRIN committed
457
458
        {genome: [gembase_start_name, orig_seq_file, to_annotate_seq_file,
                  genome_size, nb_contigs, L90]}
459
460
461
    kept_genomes : list
        list of genomes kept
    list_file : str
Amandine  PERRIN's avatar
Amandine PERRIN committed
462
        path to input file containing the list of genomes
463
464
465
    res_path : str
        folder where results must be saved
    qc : bool
466
467
468
469
470
        * True: called only if QC only. Name this file info-genomes-<list_file>.txt to put
        information on genomes that would be annotated if not QC only
        * otherwise (False), called in any case. Name this file discarded-<list_file>.txt
        and write all discarded genomes, whether sequences kept are next annotated or not
        => columns: orig_name, to_annotate, gsize, nb_conts, L90
471
    """
472
    logger = logging.getLogger("utils")
Amandine  PERRIN's avatar
Amandine PERRIN committed
473
    # number of genomes discarded
474
    nb_disc = len(genomes) - len(kept_genomes)
Amandine  PERRIN's avatar
Amandine PERRIN committed
475
    # Log number of genomes discarded.
476
    if not qc and nb_disc < 2:
Amandine  PERRIN's avatar
Amandine PERRIN committed
477
        logger.info(f"{nb_disc} genome was discarded.")
478
    elif not qc:
Amandine  PERRIN's avatar
Amandine PERRIN committed
479
        logger.info(f"{nb_disc} genomes were discarded.")
Amandine  PERRIN's avatar
Amandine PERRIN committed
480
    # Get input list file name (without path)
481
    _, name_lst = os.path.split(list_file)
Amandine  PERRIN's avatar
Amandine PERRIN committed
482
    # if not QC, write discarded genomes to a file "discarded-[list_file].lst"
483
484
485
    if not qc:
        outdisc = os.path.join(res_path,
                               "discarded-" + ".".join(name_lst.split(".")[:-1]) + ".lst")
486
        logger.info("Writing discarded genomes to {}".format(outdisc))
Amandine  PERRIN's avatar
Amandine PERRIN committed
487
    # if QC, there is no 'discarded genome', just write information on all analyzed genomes
488
489
    else:
        outdisc = os.path.join(res_path,
490
                               "ALL-GENOMES-info-" + ".".join(name_lst.split(".")[:-1]) + ".lst")
Amandine  PERRIN's avatar
Amandine PERRIN committed
491
        logger.info("Writing information on genomes in {}".format(outdisc))
492
    with open(outdisc, "w") as outdf:
493
        outdf.write("\t".join(["orig_name", "to_annotate", "gsize", "nb_conts", "L90"]) + "\n")
494
495
496
        for genome, values in genomes.items():
            if genome in kept_genomes:
                continue
497
498
499
            _, _, to_annotate, gsize, nbcont, l90 = [str(x) for x in values]
            to_annotate_file = os.path.basename(to_annotate)
            outdf.write("\t".join([genome, to_annotate_file, gsize, nbcont, l90]) + "\n")
500
501
502
503
504


def write_lstinfo(list_file, genomes, outdir):
    """
    Write lstinfo file, with following columns:
Amandine  PERRIN's avatar
Amandine PERRIN committed
505
    gembase_name, orig_name, to_annotate_name, size, nbcontigs, l90
506

507
508
509
510
511
    Parameters
    ----------
    list_file : str
        input file containing the list of genomes
    genomes : dict
Amandine  PERRIN's avatar
Amandine PERRIN committed
512
        {genome: [gembase_start_name, seq_file, seq_to_annotate, genome_size, nb_contigs, L90]}
513
514
515
    outdir : str
        folder where results must be saved

516
517
    """
    _, name_lst = os.path.split(list_file)
518
    outlst = os.path.join(outdir, "LSTINFO-" + ".".join(name_lst.split(".")[:-1]) + ".lst")
519
    with open(outlst, "w") as outf:
Amandine  PERRIN's avatar
Amandine PERRIN committed
520
521
        outf.write("\t".join(["gembase_name", "orig_name", "to_annotate", "gsize",
                              "nb_conts", "L90"]) + "\n")
522
        for genome, values in sorted(genomes.items(), key=sort_genomes_byname_l90_nbcont):
Amandine  PERRIN's avatar
Amandine PERRIN committed
523
            gembase, _, to_annote, gsize, nbcont, l90 = [str(x) for x in values]
524
            outf.write("\t".join([gembase, genome, to_annote, gsize, nbcont, l90]) + "\n")
Amandine  PERRIN's avatar
Amandine PERRIN committed
525
526
    return outlst

527

Amandine  PERRIN's avatar
Amandine PERRIN committed
528
def sort_genomes_by_name(x):
529
530
    """
    order by:
531
532
533
534
535
536
537
538
539
540
541
542
543
544

        - species
        - in each species, by strain number

    Parameters
    ----------
    x : tuple or str
        [genome_orig, [gembase, path, gsize, nbcont, L90]] with gembase = species.date.strain

    Returns
    -------
    str
        variable to take into account for sorting. If format is ESCO.1512.00001 return\
        ESCO and 00001. Otherwise, just return x itself (sort by alphabetical order)
545
    """
546
    # get gembase name
547
    if isinstance(x, tuple):
548
549
550
551
        x = x[1][0]

    # if format is ESCO.1512.00001 sort by ESCO, then 00001
    if "." in x and len(x.split(".")) >= 3:
552
        return x.split(".")[0], int(x.split(".")[-1])
553
    # if format is not like this, just return alphabetical order
554
    return x,
555
556


Amandine  PERRIN's avatar
Amandine PERRIN committed
557
558
559
560
561
562
563
564
565
566
567
def sort_genomes_byname_l90_nbcont(x):
    """
    Sort all genomes with the following criteria:

    - sort by species (x[1][0] is species.date)
    - for each species, sort by l90
    - for same l90, sort by nb contigs

    Parameters
    ----------
    x : [[]]
568
        [genome_name, [species.date, path, path_to_seq, gsize, nbcont, L90]]
Amandine  PERRIN's avatar
Amandine PERRIN committed
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597

    Returns
    -------
    tuple
        information on species, l90 and nb_contigs
    """
    return x[1][0].split(".")[0], x[1][-1], x[1][-2]


def sort_genomes_l90_nbcont(x):
    """
    Sort all genomes with the following criteria:

    - for each strain, sort by l90
    - for same l90, sort by nb contigs

    Parameters
    ----------
    x : [[]]
        [genome_name, [species.date, path, gsize, nbcont, L90]]

    Returns
    -------
    tuple
        information on l90 and nb_contigs
    """
    return x[1][-1], x[1][-2]


598
599
600
def sort_proteins(x):
    """
    order by:
601

602
603
604
    - species
    - in each species, strain number
    - in each species and strain number, by protein number
605
606
607
608
609
610
611
612
613
614
615
616

    Parameters
    ----------
    x : str
        species.date.strain.contig_protnum

    Returns
    -------
    str
        variable to take into account for sorting. If format is ESCO.1512.00001.i0002_12124,\
        return ESCO, 00001 and 12124. If not, it must be something_00001:\
        return something and 00001.
617
    """
618
    try:
619
620
        # if format is ESCO.1512.00001.i0002_12124, sort by ESCO, then 00001, then 12124
        if "." in x and len(x.split(".")) >= 3:
621
            return x.split(".")[0], int(x.split(".")[2].split("_")[0]), int(x.split("_")[-1])
622
623
        # if format is not like this, it must be something_00001:
        # sort by 'something' and then 00001
624
625
        return "_".join(x.split("_")[:-1]), int(x.split("_")[-1])
    except (IndexError, ValueError):
626
627
        logger = logging.getLogger("utils")
        logger.error(("ERROR: Protein {} does not have the required format. "
Amandine  PERRIN's avatar
Amandine PERRIN committed
628
                      "It must contain, at least <alpha-num_only>_<num_only>, and at best "
Amandine  PERRIN's avatar
Amandine PERRIN committed
629
                      "<name>.<date>.<strain_num>.<contig_info>_<prot_num>. "
630
631
                      "Please change its name.").format(x))
        sys.exit(1)
632
633


634
def read_genomes(list_file, name, date, dbpath, tmp_path, logger):
635
636
637
638
    """
    Read list of genomes, and return them.
    If a genome has a name, also return it. Otherwise, return the name given by user.

639
640
641
    Check that the given genome file exists in dbpath. Otherwise, put an error message,
    and ignore this file.

642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
    Parameters
    ----------
    list_file : str
        input file containing the list of genomes
    name : str
        Default species name
    date : str
        Default date
    dbpath : str
        path to folder containing original genome files
    tmp_path : str
        path to folder which will contain the genome files to use before annotation, if\
        needed to change them from original file (for example, merging several contig files\
        in one file, split at each stretch of 5 'N', etc.).

    Returns
    -------
    dict
        {genome: spegenus.date} spegenus.date = name.date
661
662
663
    """
    logger.info("Reading genomes")
    genomes = {}
664
    # Check that given list file exists
665
666
667
668
    if not os.path.isfile(list_file):
        logger.error(("ERROR: Your list file '{}' does not exist. "
                      "Please provide a list file.\n Ending program.").format(list_file))
        sys.exit(1)
669
    # List file exists: open it and read it
670
671
    with open(list_file, "r") as lff:
        for line in lff:
672
            line = line.strip()
673
            # empty line: go to the next one
674
            if line == "":
675
                continue
676
677
            # If separator ::, look for species and/or date
            if "::" in line:
678
679
                # genomes_inf = genome filename(s) separated by space
                # name_inf = <name>.<date>
680
681
                genomes_inf, name_inf = line.split("::")
                genomes_inf = genomes_inf.strip()
682
                cur_name, cur_date = read_info(name_inf, name, date, genomes_inf)
683
            # If no separator '::', no information on name and date of genome: use default ones
Amandine  PERRIN's avatar
Amandine PERRIN committed
684
            # Line only contains genome name (filename in given db_path)
685
686
687
688
            else:
                genomes_inf = line.strip()
                cur_name = name
                cur_date = date
689
690
691
692
693
694
695
696
697
698
699
            # If several file names, check that each one exists, and concatenate the existing files
            genomes_inf = genomes_inf.split()
            if len(genomes_inf) > 1:
                to_concat = []
                for file in genomes_inf:
                    if os.path.isfile(os.path.join(dbpath, file)):
                        to_concat.append(file)
                    else:
                        logger.warning(("{} genome file does not exist. Its file will be "
                                        "ignored when concatenating {}").format(file, genomes_inf))
                # If there are files to concatenate, concatenate them
700
                if to_concat:
701
                    genome_name = to_concat[0] + "-all.fna"
702
                    concat_file = os.path.join(tmp_path, genome_name)
703
                    to_concat = [os.path.join(dbpath, gname) for gname in to_concat]
704
                    # Put all genomes listed in 'to_concat' into a same file named 'concat_file'
705
706
                    cat(to_concat, concat_file)
                else:
707
                    # No genome file listed exists. No sequence = genome ignored
708
709
710
711
712
                    logger.warning(("None of the genome files in {} exist. "
                                    "This genome will be ignored.").format(genomes_inf))
                    genome_name = ""
            # If only 1 sequence file, check that it exists, and take its name
            else:
713
                # Genome file given does not exist
714
715
716
717
                if not os.path.isfile(os.path.join(dbpath, genomes_inf[0])):
                    logger.warning(("{} genome file does not exist. "
                                    "It will be ignored.").format(genomes_inf[0]))
                    genome_name = ""
718
                # Genome file exists: get genome_name (from information in first field)
719
720
                else:
                    genome_name = genomes_inf[0]
721
            # If there is a genome file (concatenated from several, or already existing), get the full name (with date)
722
723
            if genome_name != "":
                genomes[genome_name] = [cur_name + "." + cur_date]
724
    return genomes
725
726


Amandine  PERRIN's avatar
Amandine PERRIN committed
727
def read_genomes_info(list_file, name, date=None, logger=None):
728
    """
Amandine  PERRIN's avatar
Amandine PERRIN committed
729
    Read a lstinfo file containing the list of genomes with information (L90, genome size etc.).
730
731
    1 line per genome, 4 required columns (Others will just be ignored):
    to_annotate gsize nb_conts L90
732

Amandine  PERRIN's avatar
Amandine PERRIN committed
733
    Check that the given genome file (to_annotate column) exists.
734
735
736
737

    Parameters
    ----------
    list_file : str
738
        input file containing information on genomes (to_annotate, size, L90, nb_contigs)
739
740
741
742
    name : str
        Default species name
    date : str
        Default date
Amandine  PERRIN's avatar
Amandine PERRIN committed
743
744
745
    logger : logging.Logger
        logger object to write log information

746
747
748
749

    Returns
    -------
    dict
Amandine  PERRIN's avatar
Amandine PERRIN committed
750
751
752
        genomes = {genome:
                   [spegenus.date, path_orig_seq, path_to_splitSequence, size, nbcont, l90]
                  }
753
    """
Amandine  PERRIN's avatar
Amandine PERRIN committed
754
755
    if not logger:
        logger = logging.getLogger("prepare.utils")
756
    logger.info(f"Reading given information on your genomes in {list_file}")
757
    genomes = {}
Amandine  PERRIN's avatar
Amandine PERRIN committed
758
    if name and date:
Amandine  PERRIN's avatar
Amandine PERRIN committed
759
        spegenus = f"{name}.{date}"
Amandine  PERRIN's avatar
Amandine PERRIN committed
760
    column_order = {} # Put the number of column corresponding to each field
761
    if not os.path.isfile(list_file):
Amandine  PERRIN's avatar
Amandine PERRIN committed
762
        logger.error(f"ERROR: The info file {list_file} that you gave does not exist. "
Amandine  PERRIN's avatar
Amandine PERRIN committed
763
                      "Please provide the right path/name for this file.\nEnding program.")
764
        sys.exit(1)
Amandine  PERRIN's avatar
Amandine PERRIN committed
765
766
767
768
    message_no_header = (f"ERROR: It seems that your info file {list_file} does not have a "
                          "header, or this header does not have, at least, the required "
                          "columns tab separated: to_annotate, gsize nb_conts and L90 (in any "
                          "order).\nEnding program.")
769
770
771
    with open(list_file, "r") as lff:
        for line in lff:
            line = line.strip()
Amandine  PERRIN's avatar
Amandine PERRIN committed
772
773
774
            # Ignore empty lines
            if line == "":
                continue
Amandine  PERRIN's avatar
Amandine PERRIN committed
775
            # Header line: Just get column number corresponding to each field
776
            if "to_annotate" in line:
Amandine  PERRIN's avatar
Amandine PERRIN committed
777
778
                column_headers = line.split("\t")
                column_order = {header:num for num,header in enumerate(column_headers)}
779
                found = [head for head in ["to_annotate", "gsize", "nb_conts", "L90"]
Amandine  PERRIN's avatar
Amandine PERRIN committed
780
781
                         if head in column_order]
                if len(found) != 4:
Amandine  PERRIN's avatar
Amandine PERRIN committed
782
                    logger.error(message_no_header)
Amandine  PERRIN's avatar
Amandine PERRIN committed
783
                    sys.exit(1)
784
            # If no header found, error message and exit
Amandine  PERRIN's avatar
Amandine PERRIN committed
785
            if not column_order:
Amandine  PERRIN's avatar
Amandine PERRIN committed
786
                logger.error(message_no_header)
Amandine  PERRIN's avatar
Amandine PERRIN committed
787
                sys.exit(1)
788
            # Get all information on the given genome
Amandine  PERRIN's avatar
Amandine PERRIN committed
789
790
            # line.strip().split() -> all given information.
            # So, at least to_annotate, gsize, nbcont, L90
791
            # Get numeric information
Amandine  PERRIN's avatar
Amandine PERRIN committed
792
            try:
Amandine  PERRIN's avatar
Amandine PERRIN committed
793
794
                infos = line.strip().split()
                # Get genome name with its path to db_dir
Amandine  PERRIN's avatar
Amandine PERRIN committed
795
                gpath = infos[column_order["to_annotate"]]
Amandine  PERRIN's avatar
Amandine PERRIN committed
796
797
                gfile = os.path.basename(gpath)
                gname = os.path.splitext(gfile)[0]
Amandine  PERRIN's avatar
Amandine PERRIN committed
798
799
800
                gsize = int(infos[column_order["gsize"]])
                gl90 = int(infos[column_order["L90"]])
                gcont = int(infos[column_order["nb_conts"]])
Amandine  PERRIN's avatar
Amandine PERRIN committed
801
            # If invalid values, warning message and ignore genome
Amandine  PERRIN's avatar
Amandine PERRIN committed
802
            except ValueError:
Amandine  PERRIN's avatar
Amandine PERRIN committed
803
                logger.warning(f"For genome {gname}, at least one of your columns 'gsize', "
804
                                "'nb_conts' or 'L90' contains a non numeric value. "
Amandine  PERRIN's avatar
Amandine PERRIN committed
805
                                "This genome will be ignored.")
Amandine  PERRIN's avatar
Amandine PERRIN committed
806
                continue
Amandine  PERRIN's avatar
Amandine PERRIN committed
807
808
            # If no value for at least 1 field, warning message and ignore genome
            except IndexError:
Amandine  PERRIN's avatar
Amandine PERRIN committed
809
                logger.error(f"ERROR: Check that all fields of {list_file} are filled in each "
Amandine  PERRIN's avatar
Amandine PERRIN committed
810
811
                             "line (can be 'NA')")
                sys.exit(1)
Amandine  PERRIN's avatar
Amandine PERRIN committed
812
813
            # Could we find genome file?
            # Check if genome file exists in db_path.
Amandine  PERRIN's avatar
Amandine PERRIN committed
814
            if not os.path.isfile(gpath):
Amandine  PERRIN's avatar
Amandine PERRIN committed
815
816
                logger.warning(f"{gpath} genome file does not exist. This genome will be ignored.")
                continue
Amandine  PERRIN's avatar
Amandine PERRIN committed
817
            # cur genome information to save:
818
            # [spegenus.date, path_orig_seq, path_to_sequence_to_annotate, size, nbcont, l90]
Amandine  PERRIN's avatar
Amandine PERRIN committed
819
820
821
822
823
824
825
            if name and date:
                genomes[gpath] = [spegenus, gpath, gpath, gsize, gcont, gl90]
            # If called from prepare, no need to rename genomes
            else:
                gfile = os.path.basename(gpath)
                gname = os.path.splitext(gfile)[0]
                genomes[gfile] = [gname, gpath, gpath, gsize, gcont, gl90]
826
827
828
    if len(genomes) > 0:
        logger.info(("Found {} genomes in total").format(len(genomes)))
    else:
829
        logger.error(f"No genome listed in {list_file} was found.")
830
        sys.exit(1)
Amandine  PERRIN's avatar
Amandine PERRIN committed
831
    return genomes
832
833


834
835
836
837
838
839
def read_info(name_inf, name, date, genomes_inf):
    """
    From the given information in 'name_inf', check if there is a name (and if its
    format is ok) and if there is a date (and if its format is ok).
    If no name (resp. no date), return default name (resp. default date).

840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
    Parameters
    ----------
    name_inf : str
        information on current genome, which could contain a species name and a date
    name : str
        default species name
    date : str
        default date
    genomes_inf : str
        current genome filename. Used to complete information when there is a warning (species\
        name or date given not in the right format...)

    Returns
    -------
    (cur_name, cur_date) : tuple
        with:

        - curname: name to use for this genome (can be the default one, or the one read from\
        'name_inf'
        - curdate: date to use for this genome (default or read from 'name_inf')
860
    """
861
    logger = logging.getLogger("utils")
862
863
    # information on current genome, which could contain a species name and a date: split to get
    # name and date
864
    name_inf = name_inf.strip().split(".")
865
866
867
    # if only 1 field provided (means only name is provided).
    # (Otherwise, if only date is provided, it is with '.DATE', so
    # len(name_inf) = 2 with name_inf[0] = "")
868
    if len(name_inf) == 1:
869
        # No genome name: use default one
870
871
        if name_inf[0] == "":
            cur_name = name
872
        # Genome name, and in right format: use it
873
874
        elif check_format(name_inf[0]):
            cur_name = name_inf[0]
875
        # Genome name but wrong format: warning message and use default one
876
877
878
879
880
881
        else:
            logger.warning(("Invalid name {} given for genome {}. Only put "
                            "4 alphanumeric characters in your date and name. "
                            "For this genome, the default name ({}) will be "
                            "used.").format(name_inf[0], genomes_inf, name))
            cur_name = name
882
        # In any case, no given date: use default date
883
        cur_date = date
884
885
886
887
888

    # More than 2 informations (more than 2 fields separated by a '.'):
    #  - either user put more information
    #  - either user put a '.' inside the name or date field: wrong format
    # -> warning message and use default date and name.
889
890
891
892
893
894
895
    elif len(name_inf) > 2:
        logger.warning(("Invalid name/date given for genome {}. Only put "
                        "4 alphanumeric characters in your date and name. For "
                        "this genome, the default name ({}) and date ({}) will "
                        "be used.").format(genomes_inf, name, date))
        cur_name = name
        cur_date = date
896
    # information on name and date given (can be empty)
897
898
    else:
        cur_name, cur_date = name_inf
899
        # name given empty -> use default name
900
901
        if cur_name == "":
            cur_name = name
902
        # date given empty -> use default date
903
904
        if cur_date == "":
            cur_date = date
905
        # name given not empty but wrong format -> warning message and default name
906
907
908
909
910
911
        if not check_format(cur_name):
            logger.warning(("Invalid name {} given for genome {}. Only put "
                            "4 alphanumeric characters in your date and name. "
                            "For this genome, the default name ({}) "
                            "will be used.").format(cur_name, genomes_inf, name))
            cur_name = name
912
        # date given not empty but wrong format -> warning message and default date
913
914
915
916
917
918
919
920
921
        if not check_format(cur_date):
            logger.warning(("Invalid date {} given for genome {}. Only put "
                            "4 alphanumeric characters in your date and name. "
                            "For this genome, the default date ({}) "
                            "will be used.").format(cur_date, genomes_inf, date))
            cur_date = date
    return cur_name, cur_date


922
def cat(list_files, output, title=None):
923
    """
924
925
    Equivalent of 'cat' unix command.

Amandine  PERRIN's avatar
Amandine PERRIN committed
926
    Concatenate all files in 'list_files' and save result in 'output' folder.
927
928
    Concat using shutil.copyfileobj, in order to copy by chunks, to
    avoid memory problems if files are big.
929
930
931
932
933
934
935
936

    Parameters
    ----------
    list_files : list
        list of filenames to concatenate
    output : str
        output filename, where all concatenated files will be written
    title : str or None
Amandine  PERRIN's avatar
Amandine PERRIN committed
937
        if you want to show a progressbar while concatenating files, add a title for this
938
939
        progressbar here. If no title, nothing will be shown during concatenation.

940
    """
941
942
    bar = None
    curnum = None
943
944
945
    if title:
        nbfiles = len(list_files)
        widgets = [title + ': ', progressbar.Bar(marker='█', left='', right='', fill=' '),
Amandine  PERRIN's avatar
Amandine PERRIN committed
946
                   ' ', progressbar.Counter(), f"/{nbfiles}" ' (',
947
                   progressbar.Percentage(), ") - ", progressbar.Timer()]
Amandine  PERRIN's avatar
Amandine PERRIN committed
948
        bar = progressbar.ProgressBar(widgets=widgets, max_value=nbfiles, term_width=79).start()
949
        curnum = 1
950
951
    with open(output, "w") as outf:
        for file in list_files:
952
953
954
            if title:
                bar.update(curnum)
                curnum += 1
955
956
            with open(file, "r") as inf:
                shutil.copyfileobj(inf, outf)
957
958
    if title:
        bar.finish()
959
960


961
def grep(filein, pattern, counts=False):
962
    """
963
964
    Equivalent of 'grep' unix command

965
    By default, returns all the lines containing the given pattern.
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
    If counts = True, returns the number of lines containing the pattern.

    Parameters
    ----------
    filein : str
        path to the file in which pattern must be searched
    pattern : str
        pattern to search
    counts : bool
        True if you want to count how many lines have the pattern and return this number,\
        False if you want to return all lines containing the pattern.

    Returns
    -------
    list or int
        list of lines if counts=False; number of lines if counts=True
982
983
984
985
986
987
988
989
    """
    num = 0
    lines = []
    with open(filein, "r") as inf:
        for line in inf:
            if re.search(pattern, line):
                lines.append(line.strip())
                num += 1
990
    if counts:
991
992
993
994
995
996
997
        return num
    else:
        return lines


def count(filein, get="lines"):
    """
998
999
    Similar to 'wc' unix command.

1000
    Count the number of what is given in 'get'. It can be:
1001

1002