prepare.py 30.7 KB
Newer Older
Amandine  PERRIN's avatar
Amandine PERRIN committed
1
2
3
#!/usr/bin/env python3
# coding: utf-8

4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# ###############################################################################
# This file is part of PanACOTA.                                                #
#                                                                               #
# Authors: Amandine Perrin                                                      #
# Copyright © 2018-2020 Institut Pasteur (Paris).                               #
# See the COPYRIGHT file for details.                                           #
#                                                                               #
# PanACOTA is a software providing tools for large scale bacterial comparative  #
# genomics. From a set of complete and/or draft genomes, you can:               #
#    -  Do a quality control of your strains, to eliminate poor quality         #
# genomes, which would not give any information for the comparative study       #
#    -  Uniformly annotate all genomes                                          #
#    -  Do a Pan-genome                                                         #
#    -  Do a Core or Persistent genome                                          #
#    -  Align all Core/Persistent families                                      #
#    -  Infer a phylogenetic tree from the Core/Persistent families             #
#                                                                               #
# PanACOTA is free software: you can redistribute it and/or modify it under the #
# terms of the Affero GNU General Public License as published by the Free       #
# Software Foundation, either version 3 of the License, or (at your option)     #
# any later version.                                                            #
#                                                                               #
# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY   #
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS     #
# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License           #
# for more details.                                                             #
#                                                                               #
# You should have received a copy of the Affero GNU General Public License      #
# along with PanACOTA (COPYING file).                                           #
# If not, see <https://www.gnu.org/licenses/>.                                  #
# ###############################################################################

Amandine  PERRIN's avatar
Amandine PERRIN committed
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""
Subcommand to prepare a dataset:

- Download all genomes of a given species from refseq
- Filter them with L90 and number of contigs thresholds
- Remove too close/far genomes using Mash


@author Amandine PERRIN
August 2019
"""
import os
import sys
import logging
Amandine  PERRIN's avatar
Amandine PERRIN committed
50
from termcolor import colored
Amandine  PERRIN's avatar
Amandine PERRIN committed
51

Amandine  PERRIN's avatar
Amandine PERRIN committed
52
from PanACoTA import __version__ as version
Amandine  PERRIN's avatar
Amandine PERRIN committed
53
54
from PanACoTA import utils
from PanACoTA.prepare_module import download_genomes_func as dgf
Amandine  PERRIN's avatar
Amandine PERRIN committed
55
from PanACoTA.prepare_module import filter_genomes as fg
Amandine  PERRIN's avatar
Amandine PERRIN committed
56

Amandine  PERRIN's avatar
Amandine PERRIN committed
57

Amandine  PERRIN's avatar
Amandine PERRIN committed
58
59
60
61
62
63
64
65
66
67
68
def main_from_parse(arguments):
    """
    Call main function from the arguments given by parser

    Parameters
    ----------
    arguments : argparse.Namespace
        result of argparse parsing of all arguments in command line

    """
    cmd = "PanACoTA " + ' '.join(arguments.argv)
69
70
    main(cmd, arguments.ncbi_species_name, arguments.ncbi_species_taxid, arguments.ncbi_taxid, arguments.strains,
         arguments.levels, arguments.ncbi_section, arguments.outdir, arguments.tmp_dir, arguments.parallel, arguments.norefseq,
71
         arguments.db_dir, arguments.only_mash,
Amandine  PERRIN's avatar
Amandine PERRIN committed
72
         arguments.info_file, arguments.l90, arguments.nbcont, arguments.cutn, arguments.min_dist,
73
         arguments.max_dist, arguments.verbose, arguments.quiet)
Amandine  PERRIN's avatar
Amandine PERRIN committed
74
75


76
def main(cmd, ncbi_species_name, ncbi_species_taxid, ncbi_taxid, ncbi_strains, levels, ncbi_section,
77
         outdir, tmp_dir, threads, norefseq, db_dir,
78
         only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist, verbose, quiet):
Amandine  PERRIN's avatar
Amandine PERRIN committed
79
80
81
82
    """
    Main method, constructing the draft dataset for the given species

    verbosity:
Amandine  PERRIN's avatar
Amandine PERRIN committed
83
84
85
86
87
    - defaut 0 : stdout contains INFO, stderr contains ERROR, .log contains INFO and more, .log.err contains warning and more
    - 1: same as 0 + WARNING in stderr
    - 2: same as 1 + DETAILS in stdout + DETAILS in .log.details
    - >=15: same as 2 + Add DEBUG in stdout + create .log.debug with everything from info to debug

Amandine  PERRIN's avatar
Amandine PERRIN committed
88
89
90
91
92

    Parameters
    ----------
    cmd : str
        command line used to launch this program
Amandine  PERRIN's avatar
Amandine PERRIN committed
93
    ncbi_species_name : str
Amandine  PERRIN's avatar
Amandine PERRIN committed
94
        name of species to download, as given by NCBI
Amandine  PERRIN's avatar
Amandine PERRIN committed
95
    ncbi_species_taxid : int
Amandine  PERRIN's avatar
Amandine PERRIN committed
96
        species taxid given in NCBI
Amandine  PERRIN's avatar
Amandine PERRIN committed
97
    ncbi_taxid : int
98
99
100
        NCBI taxid (sub-species)
    ncbi_strains : str
        specific strains to download
101
102
103
    levels: str
        Level of assembly to download. Choice between 'all', 'complete', 'chromosome',
        'scaffold', 'contig'. Default is 'all'
Amandine  PERRIN's avatar
Amandine PERRIN committed
104
105
    outdir : str
        path to output directory (where created database will be saved).
Amandine  PERRIN's avatar
Amandine PERRIN committed
106
    tmp_dir : str
107
        Path to directory where tmp files are saved (sequences split at each row of 5 'N')
Amandine  PERRIN's avatar
Amandine PERRIN committed
108
109
    threads : int
        max number of threads to use
Amandine  PERRIN's avatar
Amandine PERRIN committed
110
    norefseq : bool
Amandine  PERRIN's avatar
Amandine PERRIN committed
111
        True if user does not want to download again the database
112
113
    db_dir : str
        Name of the folder where already downloaded fasta files are saved.
Amandine  PERRIN's avatar
Amandine PERRIN committed
114
115
    only_mash : bool
        True if user user already has the database and quality of each genome (L90, #contigs etc.)
116
117
118
    info_file : str
        File containing information on QC if it was already ran before (columns to_annotate,
        gsize, nb_conts and L90).
Amandine  PERRIN's avatar
Amandine PERRIN committed
119
120
121
122
123
    l90 : int
        Max L90 allowed to keep a genome
    nbcont : int
        Max number of contigs allowed to keep a genome
    cutn : int
124
        cut at each when there are 'cutn' N in a row. Don't cut if equal to 0
Amandine  PERRIN's avatar
Amandine PERRIN committed
125
126
    min_dist : int
        lower limit of distance between 2 genomes to keep them
127
128
    max_dist : int
        upper limit of distance between 2 genomes to keep them (default is 0.06)
Amandine  PERRIN's avatar
Amandine PERRIN committed
129
130
    verbose : int
        verbosity:
131
132
        - defaut 0 : stdout contains INFO, stderr contains ERROR, .log contains INFO and more,
          .log.err contains warning and more
Amandine  PERRIN's avatar
Amandine PERRIN committed
133
134
        - 1: same as 0 + WARNING in stderr
        - 2: same as 1 + DETAILS in stdout + DETAILS in .log.details
135
136
        - >=15: same as 2 + Add DEBUG in stdout + create .log.debug with everything
          from info to debug
Amandine  PERRIN's avatar
Amandine PERRIN committed
137
138
139
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise
    """
Amandine  PERRIN's avatar
Amandine PERRIN committed
140

Amandine  PERRIN's avatar
Amandine PERRIN committed
141
    # get species name in NCBI format
Amandine  PERRIN's avatar
Amandine PERRIN committed
142
143
    # -> will be used to name output directory
    # -> will be used to download summary file if given species corresponds to NCBI name
Amandine  PERRIN's avatar
Amandine PERRIN committed
144
145
    if ncbi_species_name:
        species_linked = "_".join(ncbi_species_name.split())
Amandine  PERRIN's avatar
Amandine PERRIN committed
146
147
        species_linked = "_".join(species_linked.split("/"))

Amandine  PERRIN's avatar
Amandine PERRIN committed
148
149
150
151
152
153
    # if species name not given by user, use species taxID (if given) to name output directory
    elif ncbi_species_taxid:
        species_linked = str(ncbi_species_taxid)
    # if species name not species taxid by user, use taxID (if given) to name output directory
    elif ncbi_taxid:
        species_linked = str(ncbi_taxid)
154
155
156
157
    # If no species nor taxID, get specific strain names
    elif ncbi_strains:
        if os.path.isfile(ncbi_strains):
            species_linked = os.path.basename(ncbi_strains)
158
            species_linked = os.path.splitext(species_linked)[0]
159
160
        else:
            species_linked = "_".join(ncbi_strains.split())
161
162
            species_linked = "-".join(species_linked.split("/"))
            species_linked = "_and_".join(species_linked.split(","))
163
    # if neither speName, speID, taxID nor strainName given (--norefseq, mashonly), name is NA
164
165
    else:
        species_linked = "NA"
Amandine  PERRIN's avatar
Amandine PERRIN committed
166
167
168
169
170
171
172
    # Default outdir is species name if given, or species taxID
    if not outdir:
        outdir = species_linked
    # Default tmp_dir is outdir/tmp_files
    if not tmp_dir:
        tmp_dir = os.path.join(outdir, "tmp_files")
    # directory that will be created by ncbi_genome_download
173
    ncbidir = os.path.join(outdir, ncbi_section, "bacteria")
Amandine  PERRIN's avatar
Amandine PERRIN committed
174
    os.makedirs(outdir, exist_ok=True)
Amandine  PERRIN's avatar
Amandine PERRIN committed
175
    os.makedirs(tmp_dir, exist_ok=True)
Amandine  PERRIN's avatar
Amandine PERRIN committed
176
177
178
179
180
181
182
183
184
185
186
187

    # Initialize logger
    # set level of logger: level is the minimum level that will be considered.
    if verbose <= 1:
        level = logging.INFO
    # for verbose = 2, ignore only debug
    if verbose >= 2 and verbose < 15:
        level = utils.detail_lvl() # int corresponding to detail level
    # for verbose >= 15, write everything
    if verbose >= 15:
        level = logging.DEBUG
    logfile_base = os.path.join(outdir, "PanACoTA_prepare_{}").format(species_linked)
188
    logfile_base, logger = utils.init_logger(logfile_base, level, 'prepare', log_details=True,
Amandine  PERRIN's avatar
Amandine PERRIN committed
189
                                             verbose=verbose, quiet=quiet)
Amandine  PERRIN's avatar
Amandine PERRIN committed
190

191
    # Message on what will be done (cmd, cores used)
Amandine  PERRIN's avatar
Amandine PERRIN committed
192
    logger.info(f'PanACoTA version {version}')
Amandine  PERRIN's avatar
Amandine PERRIN committed
193
    logger.info("Command used\n \t > " + cmd)
Amandine  PERRIN's avatar
Amandine PERRIN committed
194
195
196
    message = f"'PanACoTA prepare' will run on {threads} "
    message += f"cores" if threads>1 else "core"
    logger.info(message)
Amandine  PERRIN's avatar
Amandine PERRIN committed
197

Amandine  PERRIN's avatar
Amandine PERRIN committed
198
    # Start prepare step
199
    # Run more than only mash filter (!only_mash):
Amandine  PERRIN's avatar
Amandine PERRIN committed
200
201
    # - start from QC and mash (norefseq)
    # - start from genome download (!norefseq))
Amandine  PERRIN's avatar
Amandine PERRIN committed
202
    if not only_mash:
203
204
205
        # Not only mash, so a new info file will be created. If the user still gave an info
        # file (he will be warned that it will be ignored), rename it with '.bak'
        # to avoid erasing it
206
        if info_file and os.path.isfile(info_file):
207
            os.rename(info_file, info_file + ".back")
208

Amandine  PERRIN's avatar
Amandine PERRIN committed
209
        # 'norefseq = True" : Do not download genomes, just do QC and mash filter on given genomes
210
        # -> if not, error and exit
Amandine  PERRIN's avatar
Amandine PERRIN committed
211
        if norefseq:
212
            logger.warning(f'You asked to skip {ncbi_section} downloads.')
213
214
215
216
217
218

            # -> if db_dir given, watch for sequences there. If does not exist, error and exit
            # (user gave a directory (even if it does not exist), so we won't look for
            # the sequences in other folders)
            if db_dir:
                if not os.path.exists(db_dir):
Amandine  PERRIN's avatar
Amandine PERRIN committed
219
220
                    logger.error(f"Database folder {db_dir} supposed to contain fasta "
                                 "sequences does not "
221
222
                                 "exist. Please give a valid folder, or leave the default "
                                 "directory (no '-d' option).")
Amandine  PERRIN's avatar
Amandine PERRIN committed
223
                    sys.exit(1)
224
225
226
227
228
229
            # -> If user did not give db_dir, genomes could be in
            # outdir/Database_init/<genome_name>.fna
            else:
                db_dir = os.path.join(outdir, "Database_init")
                # If it does not exist, check if default compressed files folder exists.
                if not os.path.exists(db_dir):
Amandine  PERRIN's avatar
Amandine PERRIN committed
230
231
                    logger.warning(f"Database folder {db_dir} supposed to contain fasta "
                                   "sequences does not "
232
233
234
235
236
                                   "exist. We will check if the download folder (with compressed "
                                   "sequences) exists.")
                    # -> if not in database_init, genomes must be in
                    # outdir/refeq/bacteria/<genome_name>.fna.gz. In that case,
                    # uncompress and add them to Database_init
237
238
                    if not os.path.exists(ncbidir):
                        logger.error(f"Folder {ncbidir} does not exist. You do not have any "
Amandine  PERRIN's avatar
Amandine PERRIN committed
239
240
241
                                     "genome to analyse. Possible reasons:\n"
                                     "- if you want to rerun analysis in the same folder as "
                                     "sequences were downloaded (my_outdir/Database_init or "
242
                                     f"my_outdir/{ncbi_section}), make sure you have '-o my_outdir' "
Amandine  PERRIN's avatar
Amandine PERRIN committed
243
244
245
246
247
                                     "option\n"
                                     "- if you want to rerun analysis and save them in a new "
                                     "output folder called 'new_outdir', make sure you have "
                                     "'-o new_outdir' option, "
                                     "and you specified where the uncompressed sequences to "
248
                                     "use are ('-d sequence_database_path'). ")
249
250
                        sys.exit(1)
                    # add genomes from refseq/bacteria folder to Database_init
251
                    nb_gen, _ = dgf.to_database(outdir, ncbi_section)
Amandine  PERRIN's avatar
Amandine PERRIN committed
252
253
        # No sequence: Do all steps -> download, QC, mash filter
        else:
Amandine  PERRIN's avatar
Amandine PERRIN committed
254
            # Download all genomes of the given taxID
255
            db_dir, nb_gen = dgf.download_from_ncbi(species_linked, ncbi_section, ncbi_species_name, ncbi_species_taxid,
256
                                                      ncbi_taxid, ncbi_strains, levels, outdir, threads)
257
            logger.info(f"{nb_gen} {ncbi_section} genome(s) downloaded")
258

Amandine  PERRIN's avatar
Amandine PERRIN committed
259
        # Now that genomes are downloaded and uncompressed, check their quality to remove bad ones
260
        genomes = fg.check_quality(species_linked, db_dir, tmp_dir, l90, nbcont, cutn)
261

Amandine  PERRIN's avatar
Amandine PERRIN committed
262
263
    # Do only mash filter. Genomes must be already downloaded, and there must be a file with
    # all information on these genomes (L90 etc.)
Amandine  PERRIN's avatar
Amandine PERRIN committed
264
    else:
Amandine  PERRIN's avatar
Amandine PERRIN committed
265
        logger.warning('You asked to run only mash steps.')
Amandine  PERRIN's avatar
Amandine PERRIN committed
266
        if not os.path.exists(info_file):  # info-file missing -> error and exit
267
            logger.error(f"Your info file {info_file} does not exist. Please provide the  "
268
269
                          "right name/path, or remove the '--mash-only option to rerun "
                          "quality control.")
Amandine  PERRIN's avatar
Amandine PERRIN committed
270
            sys.exit(1)
Amandine  PERRIN's avatar
Amandine PERRIN committed
271
        logger.info(("You want to run only mash steps. Getting information "
Amandine  PERRIN's avatar
Amandine PERRIN committed
272
                     "from {}").format(info_file))
273
        genomes = utils.read_genomes_info(info_file, species_linked, )
274

Amandine  PERRIN's avatar
Amandine PERRIN committed
275
    # Run Mash
276
    # genomes : {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}
Amandine  PERRIN's avatar
Amandine PERRIN committed
277
278
    # sorted_genome : [genome_file] ordered by L90/nbcont (keys of genomes)
    sorted_genomes = fg.sort_genomes_minhash(genomes, l90, nbcont)
279
280
281
282
283
284

    # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90
    discQC = f"by-L90_nbcont-{species_linked}.txt"
    utils.write_genomes_info(genomes, sorted_genomes, discQC, outdir)

    # Remove genomes not corresponding to mash filters
Amandine  PERRIN's avatar
Amandine PERRIN committed
285
    removed = fg.iterative_mash(sorted_genomes, genomes, outdir, species_linked,
286
                                min_dist, max_dist, threads, quiet)
287
    # Write list of genomes kept, and list of genomes discarded by mash step
288
289
    info_file = fg.write_outputfiles(genomes, sorted_genomes, removed, outdir, species_linked,
                                     min_dist, max_dist)
Amandine  PERRIN's avatar
Amandine PERRIN committed
290
    logger.info("End")
291
    return info_file
Amandine  PERRIN's avatar
Amandine PERRIN committed
292
293
294
295
296
297
298
299
300
301
302
303
304
305


def build_parser(parser):
    """
    Method to create a parser for command-line options

    Parameters
    ----------
    parser : argparse.ArgumentParser
        parser to configure in order to extract command-line arguments
    """
    import argparse
    from PanACoTA import utils_argparse

306
    general = parser.add_argument_group('General arguments')
Amandine  PERRIN's avatar
Amandine PERRIN committed
307
    general.add_argument("-T", dest="ncbi_species_taxid", default="",
Amandine  PERRIN's avatar
Amandine PERRIN committed
308
                          help=("Species taxid to download, corresponding to the "
Amandine  PERRIN's avatar
Amandine PERRIN committed
309
310
311
312
                                "'species taxid' provided by the NCBI. "
                                "This will download all sequences of this species and all its sub-species."
                                "A comma-separated list of species taxids can also be provided. "
                                "(Ex: -T 573 for Klebsiella pneumoniae)")
Amandine  PERRIN's avatar
Amandine PERRIN committed
313
                         )
Amandine  PERRIN's avatar
Amandine PERRIN committed
314
315
316
317
318
319
320
321
322
323
    general.add_argument("-t", dest="ncbi_taxid", default="",
                          help=("Taxid to download. "
                                "This can be the taxid of a sub-species, or of a specific strain. "
                                "A taxid of a subspecies will download all strains in this subspecies "
                                "EXCEPT the ones which have a specific taxid."
                                "A comma-separated list of taxids can also be provided."
                                "Ex: '-t 72407' will download all 'general' K. pneumoniae subsp. pneumoniae strains, "
                                "and '-t 1123862' will download the strain K. pneumoniae subsp. pneumoniae Kp13 "
                                "(not included in -t 72407, as it is a strain of the subspecies with a specific taxid).")
                         )
324
    general.add_argument("-S", dest="strains", default="",
325
                         help=("List of strains to download. "
326
327
                               "A comma-separated list of strain names is possible, "
                               "as well as a path to a filename containing one name per line."
Alexey Zabelkin's avatar
Alexey Zabelkin committed
328
329
                               "Ex: '-S SB2390, IA565' for Klebsiella pneumoniae SB2390 and Klebsiella pneumoniae IA565 strains"
                               "Ex: '-S path/to/list.txt' path to file with strain names, one per line.")
330
                         )
331
    general.add_argument("-g", dest="ncbi_species_name", default="",
Amandine  PERRIN's avatar
Amandine PERRIN committed
332
333
334
                          help=("Species to download, corresponding to the "
                                "'organism name' provided by the NCBI. Give name between "
                                "quotes (for example \"escherichia coli\")")
Amandine  PERRIN's avatar
Amandine PERRIN committed
335
                        )
336
337
338
    general.add_argument("-s", dest="ncbi_section", default="refseq", choices = ["refseq", "genbank"],
                          help=("NCBI section to download: all genbank, or only refseq (default)")
                        )
339
340
341
342
343
344
345
    general.add_argument("-l", "--assembly_level", dest="levels", default="",
                          help=("Assembly levels of genomes to download (default: all). "
                                "Possible levels are: 'all', 'complete', 'chromosome', "
                                "'scaffold', 'contig'."
                                "You can also provide a comma-separated list of assembly "
                                "levels. For ex: 'complete,chromosome'")
                          )
346
    general.add_argument("-o", dest="outdir",
Amandine  PERRIN's avatar
Amandine PERRIN committed
347
                          help=("Give the path to the directory where you want to save the "
348
349
350
351
352
                               "downloaded database. In the given directory, it will create "
                               "a folder 'Database_init' containing all fasta "
                               "files that will be sent to the control procedure, as well as "
                               "a folder 'refseq' with all original compressed files "
                               "downloaded from refseq. By default, this output dir name is the "
Amandine  PERRIN's avatar
Amandine PERRIN committed
353
                               "ncbi_species name if given, or ncbi_species_taxid or ncbi_taxid otherwise.")
354
355
356
                          )
    general.add_argument("--tmp", dest="tmp_dir",
                          help=("Specify where the temporary files (sequence split by rows "
Amandine  PERRIN's avatar
Amandine PERRIN committed
357
358
                                "of 'N', sequence with new contig names etc.) must be saved. "
                                "By default, it will be saved in your "
359
360
                                "out_dir/tmp_files.")
                          )
361
    general.add_argument("--cutn", dest="cutn", type=utils_argparse.positive_int, default=5,
362
363
364
365
366
367
368
369
370
371
372
373
                          help=("By default, each genome will be cut into new contigs when "
                                "at least 5 'N' in a row are found in its sequence. "
                                "If you don't want to "
                                "cut genomes into new contigs when there are rows of 'N', "
                                "put 0 to this option. If you want to cut from a different number "
                                "of 'N' in a row, put this value to this option.")
                          )
    general.add_argument("--l90", dest="l90", type=int, default=100,
                          help="Maximum value of L90 allowed to keep a genome. Default is 100.")
    general.add_argument("--nbcont", dest="nbcont", type=utils_argparse.cont_num, default=999,
                          help=("Maximum number of contigs allowed to keep a genome. "
                                "Default is 999."))
Amandine  PERRIN's avatar
Amandine PERRIN committed
374
375
    general.add_argument("--min_dist", dest="min_dist", default=1e-4,
                        type=utils_argparse.mash_dist,
376
377
378
                        help="By default, genomes whose distance to the reference is not "
                             "between 1e-4 and 0.06 are discarded. You can specify your own "
                             "lower limit (instead of 1e-4) with this option.")
Amandine  PERRIN's avatar
Amandine PERRIN committed
379
380
381
382
383
384
385
    general.add_argument("--max_dist", dest="max_dist", default=0.06,
                         type=utils_argparse.mash_dist,
                         help="By default, genomes whose distance to the reference is not "
                              "between 1e-4 and 0.06 are discarded. You can specify your own "
                              "lower limit (instead of 0.06) with this option.")
    general.add_argument("-p", "--threads", dest="parallel", type=utils_argparse.thread_num,
                         default=1, help=("Run 'N' downloads in parallel (default=1). Put 0 if "
Amandine  PERRIN's avatar
Amandine PERRIN committed
386
                                "you want to use all cores of your computer."))
387
388

    optional = parser.add_argument_group('Alternatives')
Amandine  PERRIN's avatar
Amandine PERRIN committed
389
    optional.add_argument("--norefseq", dest="norefseq", action="store_true",
Amandine  PERRIN's avatar
Amandine PERRIN committed
390
391
                          help=("If you already downloaded refseq genomes and do not want to "
                                "check them, add this option to directly go to the next steps:"
Amandine  PERRIN's avatar
Amandine PERRIN committed
392
                                "quality control (L90, number of contigs...) and mash filter. "
393
394
                                "Don't forget to specify the db_dir (-d option) where you "
                                "already have your genome sequences."))
395
396
397
398
    optional.add_argument("-d", dest="db_dir",
                          help=("If your already downloaded sequences are not in the default "
                                "directory (outdir/Database_init), you can specify here the "
                                "path to those fasta files."))
Amandine  PERRIN's avatar
Amandine PERRIN committed
399
    optional.add_argument('-M', '--only-mash', dest="only_mash", action="store_true",
Amandine  PERRIN's avatar
Amandine PERRIN committed
400
                          help=("Add this option if you already downloaded complete and refseq "
401
402
                                "genomes, and ran quality control (you have, a file "
                                "containing all genome names, as well as their genome size, "
Amandine  PERRIN's avatar
Amandine PERRIN committed
403
404
405
                                "number of contigs and L90 values). "
                                "It will then get information on genomes quality from this "
                                "file, and run mash steps."))
Amandine  PERRIN's avatar
Amandine PERRIN committed
406
    optional.add_argument("--info", dest="info_file",
407
408
409
410
411
412
413
                          help=("If you already ran the quality control, specify from which "
                                "file PanACoTA can read this information, in order to proceed "
                                "to the mash step. This file must contain at "
                                "least 4 columns, tab separated, with the following headers: "
                                "'to_annotate', 'gsize', 'nb_conts', 'L90'. Any other column "
                                "will be ignored."))

Amandine  PERRIN's avatar
Amandine PERRIN committed
414
415
    helper = parser.add_argument_group('Others')
    helper.add_argument("-v", "--verbose", dest="verbose", action="count", default=0,
Amandine  PERRIN's avatar
Amandine PERRIN committed
416
                        help="Increase verbosity in stdout/stderr and log files.")
Amandine  PERRIN's avatar
Amandine PERRIN committed
417
418
419
420
421
422
423
424
425
    helper.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,
                        help=("Do not display anything to stdout/stderr. log files will "
                              "still be created."))
    helper.add_argument("-h", "--help", dest="help", action="help",
                        help="show this help message and exit")


def parse(parser, argu):
    """
Amandine  PERRIN's avatar
Amandine PERRIN committed
426
    Parse arguments given to parser
Amandine  PERRIN's avatar
Amandine PERRIN committed
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463

    Parameters
    ----------
    parser : argparse.ArgumentParser
        the parser used
    argu : [str]
        command-line given by user, to parse using parser

    Returns
    -------
    argparse.Namespace
        Parsed arguments
    """
    import argparse

    args = parser.parse_args(argu)
    return check_args(parser, args)


def check_args(parser, args):
    """
    Check that arguments given to parser are as expected.

    Parameters
    ----------
    parser : argparse.ArgumentParser
        The parser used to parse command-line
    args : argparse.Namespace
        Parsed arguments

    Returns
    -------
    argparse.Namespace or None
        The arguments parsed, updated according to some rules. Exit program
        with error message if error occurs with arguments given.

    """
Amandine  PERRIN's avatar
Amandine PERRIN committed
464
465
466
467
468
469
470
471
472
    # Message if user kept default thresholds for L90 and nbcont. Just to warn him, to be sure
    # it was on purpose
    def thresholds_message(l90, nbcont):
        return ("  !! Your genomes will be filtered, and only the ones with 'L90' <= {} "
                "and 'number of contigs' < {} will be kept. If you want to change those "
                "thresholds, use '--l90' and '--nbcont' "
                "options.".format(args.l90, args.nbcont))

    #  ERRORS
473
474
475

    # We don't want to run only mash, nor only quality control, but don't give a NCBI taxID.
    # -> Give at least 1!
Amandine  PERRIN's avatar
Amandine PERRIN committed
476
    if (not args.only_mash and not args.norefseq and
477
478
        not args.ncbi_species_taxid and not args.ncbi_species_name and 
        not args.ncbi_taxid and not args.strains):
479
        parser.error("As you did not put the '--norefseq' nor the '-M' option, it means that "
480
481
482
483
                     "you want to download refseq (or genbank) genomes. But you did not provide "
                     "any information, so PanACoTA cannot guess which species you want to "
                     "download. Specify NCBI_taxid (-t), and/or NCBI species taxid (-T) "
                     "and/or NCBI_species (-g) and/or NCBI_strain (-S) to download, or add one of "
484
                     "the 2 options (--norefseq or -M) if you want to skip the 'download step'.")
Amandine  PERRIN's avatar
Amandine PERRIN committed
485

486
487
488
    # If norefseq, give output directory
    #  - folder containing Database_init, with all sequences
    #  - or new folder where you want to put the new results
Amandine  PERRIN's avatar
Amandine PERRIN committed
489
    if args.norefseq and not args.outdir:
490
491
        parser.error("You must provide an output directory, where your results will be saved.")

492
    # If user wants only mash steps, check that he gave info file, and outdir
Amandine  PERRIN's avatar
Amandine PERRIN committed
493
    if args.only_mash and not args.info_file:
494
        parser.error("If you want to run only Mash filtering steps, please give the "
Amandine  PERRIN's avatar
Amandine PERRIN committed
495
                     "info file with the required information (see '--info' option)")
496
497
498
    if args.only_mash and not args.outdir:
        parser.error("If you want to run only Mash filtering steps, please give the "
                     "output directory where you want to save your results (see '-o' option)")
499

500
    # Cannot be verbose and quiet at the same time
501
    if int(args.verbose) > 0 and args.quiet:
502
503
504
        parser.error("Choose between a verbose output (-v) or a quiet output (-q)."
                     " You cannot have both.")

Amandine  PERRIN's avatar
Amandine PERRIN committed
505
    # min_dist must be higher than max_dist
506
    if float(args.min_dist) >= float(args.max_dist):
Amandine  PERRIN's avatar
Amandine PERRIN committed
507
508
509
        parser.error(f"min_dist ({args.min_dist}) cannot be higher "
                     f"than max_dist ({args.max_dist})")

510
511
512
513
514
515
516
517
518
    # Check that levels, if given, are among possible ones
    possible = ["all", "complete", "chromosome", "scaffold", "contig"]
    if args.levels:
        for level in args.levels.split(","):
            if level not in possible:
                parser.error("Please choose between available assembly levels: 'all', 'complete', "
                             "'chromosome', 'scaffold', 'contig'. If several levels, provide a "
                             f"comma-separated list. Invalid value: '{args.levels}'")

Amandine  PERRIN's avatar
Amandine PERRIN committed
519
520
    # WARNINGS
    # User did not specify a species name
Amandine  PERRIN's avatar
Amandine PERRIN committed
521
    if not args.ncbi_species_name and not args.outdir:
522
523
524
525
526
        if args.ncbi_species_taxid:
            print(colored("WARNING: you did not provide a species name ('-g species' option) "
                          "nor an output directory ('-o outdir'). "
                          "All files will be downloaded in a folder called with the NCBI species "
                          f"taxid {args.ncbi_species_taxid} instead of the species name.", "yellow"))
527
528
529
530
531
        elif args.strains:
            print(colored("WARNING: you did not provide a species name ('-g species' option) "
                          "nor a species taxid ('-T spetaxid') nor an output directory ('-o outdir'). "
                          "All files will be downloaded in a folder called with the specified strains "
                          f"names {args.strains} instead of the species name.", "yellow"))
532
533
534
535
536
        else:
            print(colored("WARNING: you did not provide a species name ('-g species' option) "
                          "nor a species taxid ('-T spetaxid') nor an output directory ('-o outdir'). "
                          "All files will be downloaded in a folder called with the NCBI "
                          f"taxid {args.ncbi_taxid}.", "yellow"))
Amandine  PERRIN's avatar
Amandine PERRIN committed
537
    # If user wants to cut genomes, warn him to check that it is on purpose (because default is cut at each 5'N')
Amandine  PERRIN's avatar
Amandine PERRIN committed
538
    if args.cutn == 5:
Amandine  PERRIN's avatar
Amandine PERRIN committed
539
        message = ("  !! Your genomes will be split when sequence contains at "
540
                   "least {}'N' in a row. If you want to change this threshold, use "
Amandine  PERRIN's avatar
Amandine PERRIN committed
541
                   "'--cutn n' option (n=0 if you do not want to cut)").format(args.cutn)
Amandine  PERRIN's avatar
Amandine PERRIN committed
542
543
544
545
546
547
        print(colored(message, "yellow"))

    # Warn user about selection of genomes thresholds
    if args.l90 == 100 or args.nbcont == 999:
        print(colored(thresholds_message(args.l90, args.nbcont), "yellow"))

548
    # Warn if user gave info file, but does not ask to run only Mash -> info file will be ignored
Amandine  PERRIN's avatar
Amandine PERRIN committed
549
    if (args.info_file and not args.only_mash) or (args.info_file and not args.norefseq):
Amandine  PERRIN's avatar
Amandine PERRIN committed
550
        message = ("  !! You gave an info file (--info option), but did not ask to run only Mash "
551
552
553
                   "step (-M option). Your info file will be ignored (and renamed with '.back' "
                   "at the end), and another one will "
                   "be created with the new calculated values.")
Amandine  PERRIN's avatar
Amandine PERRIN committed
554
        print(colored(message, "yellow"))
Amandine  PERRIN's avatar
Amandine PERRIN committed
555
    return args
Amandine  PERRIN's avatar
Amandine PERRIN committed
556

Amandine  PERRIN's avatar
Amandine PERRIN committed
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579


if __name__ == '__main__':
    import argparse
    from textwrap import dedent
    header = '''
     ___                 _____  ___         _____  _____
    (  _`\              (  _  )(  _`\      (_   _)(  _  )
    | |_) )  _ _   ___  | (_) || ( (_)   _   | |  | (_) |
    | ,__/'/'_` )/' _ `\|  _  || |  _  /'_`\ | |  |  _  |
    | |   ( (_| || ( ) || | | || (_( )( (_) )| |  | | | |
    (_)   `\__,_)(_) (_)(_) (_)(____/'`\___/'(_)  (_) (_)


       Large scale comparative genomics tools

     -------------------------------------------
     '''
    my_parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                        description=dedent(header), add_help=False)
    build_parser(my_parser)
    OPTIONS = parse(my_parser, sys.argv[1:])
    main_from_parse(OPTIONS)