Add start for running the whole pipeline

2d3bb554 · Amandine PERRIN · 2dc6b57c · 2d3bb554 · 2d3bb554
Commit 2d3bb554 authored 4 years ago by Amandine PERRIN
--- a/PanACoTA/subcommands/all_modules.py
+++ b/PanACoTA/subcommands/all_modules.py
+#!/usr/bin/env python3
+# coding: utf-8
+# ###############################################################################
+# This file is part of PanACOTA.                                                #
+#                                                                               #
+# Authors: Amandine Perrin                                                      #
+# Copyright © 2018-2020 Institut Pasteur (Paris).                               #
+# See the COPYRIGHT file for details.                                           #
+#                                                                               #
+# PanACOTA is a software providing tools for large scale bacterial comparative  #
+# genomics. From a set of complete and/or draft genomes, you can:               #
+#    -  Do a quality control of your strains, to eliminate poor quality         #
+# genomes, which would not give any information for the comparative study       #
+#    -  Uniformly annotate all genomes                                          #
+#    -  Do a Pan-genome                                                         #
+#    -  Do a Core or Persistent genome                                          #
+#    -  Align all Core/Persistent families                                      #
+#    -  Infer a phylogenetic tree from the Core/Persistent families             #
+#                                                                               #
+# PanACOTA is free software: you can redistribute it and/or modify it under the #
+# terms of the Affero GNU General Public License as published by the Free       #
+# Software Foundation, either version 3 of the License, or (at your option)     #
+# any later version.                                                            #
+#                                                                               #
+# PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY   #
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS     #
+# FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License           #
+# for more details.                                                             #
+#                                                                               #
+# You should have received a copy of the Affero GNU General Public License      #
+# along with PanACOTA (COPYING file).                                           #
+# If not, see <https://www.gnu.org/licenses/>.                                  #
+# ###############################################################################
+"""
+'all' is a module of PanACoTA, allowing to run the whole pipeline at once.
+@author gem
+October 2020
+"""
+import os
+import sys
+from termcolor import colored
+import sys
+def main_from_parse(args):
+    """
+    Call main function from the arguments given by parser
+    Parameters
+    ----------
+    args : argparse.Namespace
+        result of argparse parsing of all arguments in command line
+    """
+    cmd = "PanACoTA " + ' '.join(args.argv)
+    main(cmd, args.outdir, args.threads, args.NCBI_species_taxid, args.NCBI_species,
+         args.levels, args.cutn, args.l90, args.nbcont, args.name, args.prodigal_only, args.min_id,
+         args.tol, args.multi, args.mixed, args.soft, verbose=args.verbose, quiet=args.quiet)
+def main(cmd, outdir, threads, NCBI_species_taxid, NCBI_species, levels, cutn, l90, nbcont,
+         name, prodigal_only, min_id, tol, multi, mixed, soft, verbose=0, quiet=False):
+    """
+    Call all modules, one by one, using output of one as input for the next one
+    """
+    from PanACoTA import utils
+    from PanACoTA.subcommands import prepare
+    from PanACoTA.subcommands import annotate
+    from PanACoTA.subcommands import pangenome
+    from PanACoTA.subcommands import corepers
+    from PanACoTA.subcommands import align
+    from PanACoTA.subcommands import tree
+    # Run prepare module
+    outdir_prepare = os.path.join(outdir, "1-prepare_module")
+    tmp_dir = ""
+    no_refseq = False
+    db_dir = ""
+    only_mash = False
+    info_file = ""
+    min_dist = 1e-4
+    max_dist = 0.06
+    prepare.main(cmd, NCBI_species, NCBI_species_taxid, levels, outdir_prepare, tmp_dir,
+                 threads, no_refseq, db_dir, only_mash, info_file, l90, nbcont, cutn,
+                 min_dist, max_dist, verbose, quiet)
+# -> info_file
+#     # Run annotate module
+#     list_file = ""
+#     db_path = ""
+#     outdir_annotate = os.path.join(outdir, "2-annotate_module")
+#     date = ""
+#     force = False
+#     qc_only = False
+#     tmp_dir = ""
+#     res_annot_dir = None
+#     small = False
+#     annotate.main(cmd, list_file, db_path, outdir_annotate, name, date, l90, nbcont, cutn,
+#                   threads, force, qc_only, info_file, tmp_dir, res_annot_dir,
+#                   verbose, quiet, prodigal_only, small)
+def build_parser(parser):
+    """
+    Method to create a parser for command-line options
+    Parameters
+    ----------
+    parser : argparse.ArgumentParser
+        The parser to configure
+    """
+    import argparse
+    from PanACoTA import utils_argparse
+    # Create command-line parser for all options and arguments to give
+    general = parser.add_argument_group("General arguments")
+    general.add_argument("-o", dest="outdir", required=True,
+                          help=("Path to your output folder, where all results "
+                                "from all 6 modules will be saved.")
+                          )
+    general.add_argument("--threads", dest="threads", type=utils_argparse.thread_num, default=1,
+                          help="Specify how many threads can be used (default=1)")
+    prepare = parser.add_argument_group("'prepare' module arguments")
+    prepare.add_argument("-t", dest="NCBI_species_taxid", default="",
+                          help=("Species taxid to download, corresponding to the "
+                                "'species taxid' provided by the NCBI. A comma-separated "
+                                "list of taxid can also be provided.")
+                         )
+    prepare.add_argument("-s", dest="NCBI_species", default="",
+                          help=("Species to download, corresponding to the "
+                                "'organism name' provided by the NCBI. Give name between "
+                                "quotes (for example \"escherichia coli\")")
+                        )
+    prepare.add_argument("-l", "--assembly_level", dest="levels", default="",
+                          help=("Assembly levels of genomes to download (default: all). "
+                                "Possible levels are: 'all', 'complete', 'chromosome', "
+                                "'scaffold', 'contig'."
+                                "You can also provide a comma-separated list of assembly "
+                                "levels. For ex: 'complete,chromosome'")
+                          )
+    prepare_annote = parser.add_argument_group("Common arguments to 'prepare' "
+                                               "and 'annotate' modules")
+    prepare_annote.add_argument("--cutn", dest="cutn", type=utils_argparse.positive_int, default=5,
+                          help=("By default, each genome will be cut into new contigs when "
+                                "at least 5 'N' in a row are found in its sequence. "
+                                "If you don't want to "
+                                "cut genomes into new contigs when there are rows of 'N', "
+                                "put 0 to this option. If you want to cut from a different number "
+                                "of 'N' in a row, put this value to this option.")
+                          )
+    prepare_annote.add_argument("--l90", dest="l90", type=int, default=100,
+                                help=("Maximum value of L90 allowed to keep a genome. "
+                                      "Default is 100.")
+                                )
+    prepare_annote.add_argument("--nbcont", dest="nbcont", type=utils_argparse.cont_num,
+                                default=999, help=("Maximum number of contigs allowed to "
+                                                   "keep a genome. Default is 999."))
+    annote = parser.add_argument_group("'annotate' module arguments")
+    annote.add_argument("--prodigal", dest="prodigal_only", action="store_true", default=False,
+                        help="Add this option if you only want syntactical annotation, given "
+                             "by prodigal, and not functional annotation requiring prokka and "
+                             "is slower.")
+    annote.add_argument("-n", dest="name", type=utils_argparse.gen_name,
+                        help=("Choose a name for your annotated genomes. This name should "
+                              "contain 4 alphanumeric characters. Generally, they correspond "
+                              "to the 2 first letters of genus, and 2 first letters of "
+                              "species, e.g. ESCO for Escherichia Coli."))
+    pangenome = parser.add_argument_group("'pangenome' module arguments")
+    pangenome.add_argument("-i", dest="min_id", type=perc_id, default=0.8,
+                           help=("Minimum sequence identity to be considered in the same "
+                                 "cluster (float between 0 and 1). Default is 0.8."))
+    corepers = parser.add_argument_group("'corepers' module arguments")
+    corepers.add_argument("-t", "--tol", dest="tol", default=1, type=utils_argparse.percentage,
+                          help=("min %% of genomes having at least 1 member in a family to "
+                                "consider the family as persistent (between 0 and 1, "
+                                "default is 1 = 100%% of genomes = Core genome)."
+                                "By default, the minimum number of genomes will be "
+                                "ceil('tol'*N) (N being the total number of genomes). If "
+                                "you want to use floor('tol'*N) instead, add the '-F' option."))
+    corepers.add_argument("-M", dest="multi", action='store_true',
+                          help=("Add this option if you allow several members in any genome "
+                                "of a family. By default, only 1 (or 0 if tol<1) member "
+                                "per genome are allowed in all genomes. If you want to allow "
+                                "exactly 1 member in 'tol'%% of the genomes, and 0, 1 "
+                                "or several members in the '1-tol'%% left, use the option -X "
+                                "instead of this one: -M and -X options are not compatible."))
+    corepers.add_argument("-X", dest="mixed", action='store_true',
+                          help="Add this option if you want to allow families having several "
+                               "members only in '1-tol'%% of the genomes. In the other genomes, "
+                               "only 1 member exactly is allowed. This option is not compatible "
+                               "with -M (which is allowing multigenic families: having several "
+                               "members in any number of genomes).")
+    tree = parser.add_argument_group("'tree' module arguments")
+    softs = ["fasttree", "fastme", "quicktree", "iqtree", "iqtree2"]
+    tree.add_argument("-s", "--soft", dest="soft", choices=softs, default="iqtree",
+                      help=("Choose with which software you want to infer the "
+                            "phylogenetic tree. Default is IQtree."))
+    helper = parser.add_argument_group('Others')
+    helper.add_argument("-v", "--verbose", dest="verbose", action="count", default=0,
+                        help="Increase verbosity in stdout/stderr.")
+    helper.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,
+                        help=("Do not display anything to stdout/stderr. log files will "
+                              "still be created."))
+    helper.add_argument("-h", "--help", dest="help", action="help",
+                        help="show this help message and exit")
+def parse(parser, argu):
+    """
+    arse arguments given to parser
+    Parameters
+    ----------
+    parser : argparse.ArgumentParser
+        the parser used
+    argu : [str]
+        command-line given by user, to parse using parser
+    Returns
+    -------
+    argparse.Namespace
+        Parsed arguments
+    """
+    import argparse
+    args = parser.parse_args(argu)
+    return args
+    # return check_args(parser, args)
+if __name__ == '__main__':
+    import argparse
+    from textwrap import dedent
+    header = '''
+     ___                 _____  ___         _____  _____
+    (  _`\              (  _  )(  _`\      (_   _)(  _  )
+    | |_) )  _ _   ___  | (_) || ( (_)   _   | |  | (_) |
+    | ,__/'/'_` )/' _ `\|  _  || |  _  /'_`\ | |  |  _  |
+    | |   ( (_| || ( ) || | | || (_( )( (_) )| |  | | | |
+    (_)   `\__,_)(_) (_)(_) (_)(____/'`\___/'(_)  (_) (_)
+       Large scale comparative genomics tools
+     -------------------------------------------
+     '''
+    my_parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
+                                        description=dedent(header), add_help=False)
+    build_parser(my_parser)
+    OPTIONS = parse(my_parser, sys.argv[1:])
+    main_from_parse(OPTIONS)
--- a/bin/PanACoTA
+++ b/bin/PanACoTA
@@ -6,6 +6,7 @@ from textwrap import dedent
 from PanACoTA import __version__ as version
+from PanACoTA.subcommands import all_modules
 from PanACoTA.subcommands import prepare
 from PanACoTA.subcommands import annotate
 from PanACoTA.subcommands import pangenome
@@ -64,6 +65,18 @@ def parse_arguments(argv):
    actions = {}  # to add the action to do according to the subparser called
    checks = {}  # to add the function to call to check the subparser arguments
+    # Running all modules at once. Start with ASCII art title, + small description of subcommand
+    parser_all = subparsers.add_parser('all_modules',
+                                        formatter_class=argparse.RawDescriptionHelpFormatter,
+                                        description=(dedent(header) +
+                                        "\n=> Run all PanACoTA modules"),
+                                        epilog=footer,
+                                        help="Run all PanACoTA modules",
+                                        add_help=False)
+    all_modules.build_parser(parser_all)
+    actions["all_modules"] = all_modules.main_from_parse
+    checks["all_modules"] = all_modules.check_args
    # Preparation part. Start with ASCII art title, + small description of subcommand
    parser_prepare = subparsers.add_parser('prepare',
                                            formatter_class=argparse.RawDescriptionHelpFormatter,