Skip to content
Snippets Groups Projects
Select Git revision
  • 7338b5c9f96d935773a5677260db6d2cf917d6c5
  • master default protected
  • dev
  • install
  • new_master
  • protein_ortho
  • documentation
  • pr18
  • dev-licence
  • docker
  • prodigal_train
  • containers
  • module_all
  • functional_tests
  • opti
  • helpers
  • v1.4.1
  • v1.4.0
  • v1.3.1
  • v1.3.0
  • v1.2.0
  • v1.1.0
  • v1.0.1
  • v1.0
24 results

pangenome.py

Blame
  • pangenome.py 11.67 KiB
    #!/usr/bin/env python3
    # coding: utf-8
    
    # ###############################################################################
    # This file is part of PanACOTA.                                                #
    #                                                                               #
    # Authors: Amandine Perrin                                                      #
    # Copyright © 2018-2020 Institut Pasteur (Paris).                               #
    # See the COPYRIGHT file for details.                                           #
    #                                                                               #
    # PanACOTA is a software providing tools for large scale bacterial comparative  #
    # genomics. From a set of complete and/or draft genomes, you can:               #
    #    -  Do a quality control of your strains, to eliminate poor quality         #
    # genomes, which would not give any information for the comparative study       #
    #    -  Uniformly annotate all genomes                                          #
    #    -  Do a Pan-genome                                                         #
    #    -  Do a Core or Persistent genome                                          #
    #    -  Align all Core/Persistent families                                      #
    #    -  Infer a phylogenetic tree from the Core/Persistent families             #
    #                                                                               #
    # PanACOTA is free software: you can redistribute it and/or modify it under the #
    # terms of the Affero GNU General Public License as published by the Free       #
    # Software Foundation, either version 3 of the License, or (at your option)     #
    # any later version.                                                            #
    #                                                                               #
    # PanACOTA is distributed in the hope that it will be useful, but WITHOUT ANY   #
    # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS     #
    # FOR A PARTICULAR PURPOSE. See the Affero GNU General Public License           #
    # for more details.                                                             #
    #                                                                               #
    # You should have received a copy of the Affero GNU General Public License      #
    # along with PanACOTA (COPYING file).                                           #
    # If not, see <https://www.gnu.org/licenses/>.                                  #
    # ###############################################################################
    
    """
    pangenome is a subcommand of PanACoTA
    
    
    @author gem
    May 2017
    """
    
    import sys
    import os
    
    
    def main_from_parse(args):
        """
        Call main function from the arguments given by parser
    
        Parameters
        ----------
        args : argparse.Namespace
            result of argparse parsing of all arguments in command line
        """
        cmd = "PanACoTA " + ' '.join(args.argv)
        main(cmd, args.lstinfo_file, args.dataset_name, args.dbpath, args.min_id, args.outdir,
             args.clust_mode, args.spedir, args.threads, args.outfile, args.verbose,
             args.quiet)
    
    
    def main(cmd, lstinfo, name, dbpath, min_id, outdir, clust_mode, spe_dir, threads, outfile=None,
             verbose=0, quiet=False):
        """
        Main method, doing all steps:
    
        - concatenate all protein files
        - create database as ffindex
        - cluster all proteins
        - convert to pangenome file
        - creating summary and matrix of pangenome
    
        Parameters
        ----------
        lstinfo : str
            file with name of genomes to consider for pan in the first column, without extension.
            Other columns are ignored. The first column header must be 'gembase_name'
        name : str
            name given to the dataset. For example, ESCO44 for 44 *Escherichia coli* genomes.
        dbpath : str
            path to the folder containing all protein files (files called as the name of genome
            given in lstinfo + ".prt"
        min_id : float
            Minimum percentage of identity between 2 proteins to put them in the same family
        outdir : str
            path to folder which will contain pangenome results and tmp files
        clust_mode : [0, 1, 2]
            0 for 'set cover', 1 for 'single-linkage', 2 for 'CD-Hit'
        spe_dir : str or None
            path to the folder where concatenated bank of proteins must be saved.
            None to use the same folder as protein files
        threads : int
            Max number of threads to use
        outfile : str or None
            Name of the pangenome. None to use the default name
        verbose : int
                verbosity:
            - defaut 0 : stdout contains INFO, stderr contains ERROR.
            - 1: stdout contains INFO, stderr contains WARNING and ERROR
            - 2: stdout contains (DEBUG), DETAIL and INFO, stderr contains WARNING and ERROR
            - >=15: Add DEBUG in stdout
        quiet : bool
            True if nothing must be sent to stdout/stderr, False otherwise
        """
        # import needed packages
        import logging
        from PanACoTA import utils
        from PanACoTA.pangenome_module import protein_seq_functions as protf
        from PanACoTA.pangenome_module import mmseqs_functions as mmf
        from PanACoTA.pangenome_module import post_treatment as pt
        from PanACoTA import __version__ as version
    
        # test if mmseqs is installed and in the path
        if not utils.check_installed("mmseqs"):  # pragma: no cover
            print("mmseqs is not installed. 'PanACoTA pangenome' cannot run.")
            sys.exit(1)
    
        os.makedirs(outdir, exist_ok=True)
        # level is the minimum level that will be considered.
        # for verbose = 0 or 1, ignore details and debug, start from info
        if verbose <= 1:
            level = logging.INFO
        # for verbose = 2, ignore only debug
        if verbose >= 2 and verbose < 15:
            level = utils.detail_lvl() # int corresponding to detail level
        # for verbose >= 15, write everything
        if verbose >= 15:
            level = logging.DEBUG
        # name logfile, add timestamp if already existing
        logfile_base = os.path.join(outdir, "PanACoTA-pangenome_" + name)
        utils.init_logger(logfile_base, level, '', verbose=verbose, quiet=quiet, log_details=True)
        logger = logging.getLogger("pangenome")
        logger.info(f'PanACoTA version {version}')
        logger.info("Command used\n \t > " + cmd)
    
        # Build bank with all proteins to include in the pangenome
        logger.info("build prt bank")
        prt_path = protf.build_prt_bank(lstinfo, dbpath, name, spe_dir, quiet)
        # Do pangenome
        families, panfile = mmf.run_all_pangenome(min_id, clust_mode, outdir,
                                                  prt_path, threads, outfile, quiet)
        # Create matrix pan_quali, pan_quanti and summary file
        pt.post_treat(families, panfile)
        logger.info("DONE")
        return panfile
    
    
    def build_parser(parser):
        """
        Method to create a parser for command-line options
    
        Parameters
        ----------
        parser : argparse.ArgumentParser
            parser to configure in order to extract command-line arguments
        """
        import argparse
        from PanACoTA import utils_argparse
    
        # Create command-line parser for all options and arguments to give
        required = parser.add_argument_group('Required arguments')
        required.add_argument("-l", dest="lstinfo_file", required=True,
                              help=("File containing the list of all genomes to include in "
                                    "the pan-genome, 1 genome per line: it can be the "
                                    "LSTINFO-<list_file>.lst file of 'PanACoTA annotate' module."
                                    "Here, only the first column (genome name without extension) "
                                    "will be used. All proteins of all these genomes will be "
                                    "concatenated in a file called <dataset_name>.All.prt. The "
                                    "column header must be 'gembase_name'."))
        required.add_argument("-n", dest="dataset_name", required=True,
                              help=("Name of the dataset which will be clustered (for example, "
                                    "SAEN1234 for 1234 Salmonella enterica genomes). This name will "
                                    "be used to name the protein databank, a well as the "
                                    "pangenome files."))
        required.add_argument("-d", dest="dbpath", required=True,
                              help=("Path to the folder containing all protein files corresponding "
                                    "to the genomes of the dataset (output directory 'Proteins' "
                                    "of 'PanACoTA annotate' module)."))
        required.add_argument("-o", dest="outdir", required=True,
                              help=("Output directory, where all results must be saved "
                                    "(including tmp folder)"))
    
        optional = parser.add_argument_group('Optional arguments')
        optional.add_argument("-i", dest="min_id", type=utils_argparse.perc_id, default=0.8,
                              help=("Minimum sequence identity to be considered in the same "
                                    "cluster (float between 0 and 1). Default is 0.8."))
        optional.add_argument("-f", dest="outfile",
                              help=("Use this option if you want to give the name of the pangenome "
                                    "output file (without path). Otherwise, by default, it is called "
                                    "PanGenome-mmseq_<given_dataset_name>.All.prt_<"
                                    "information_on_parameters>.lst"))
        optional.add_argument("-c", dest="clust_mode", type=int, choices=[0, 1, 2], default=1,
                              help=("Choose the clustering mode: 0 for 'set cover', 1 for "
                                    "'single-linkage', 2 for 'CD-Hit'. Default "
                                    "is 'single-linkage' (1)"))
        optional.add_argument("-s", dest="spedir",
                              help=("use this option if you want to save the concatenated protein "
                                    "databank in another directory than the one containing all "
                                    "individual protein files ('Proteins' folder)."))
        optional.add_argument("--threads", dest="threads", default=1, type=utils_argparse.thread_num,
                              help=("add this option if you want to parallelize on several threads. "
                                    "Indicate on how many threads you want to parallelize. "
                                    "By default, it uses 1 thread. Put 0 if you want to use "
                                    "all threads of your computer."))
    
        helper = parser.add_argument_group('Others')
        helper.add_argument("-v", "--verbose", dest="verbose", action="count", default=0,
                            help="Increase verbosity in stdout/stderr.")
        helper.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,
                            help=("Do not display anything to stdout/stderr. log files will "
                                  "still be created."))
        helper.add_argument("-h", "--help", dest="help", action="help",
                            help="show this help message and exit")
    
    
    def parse(parser, argu):
        """
        Parse arguments given to parser
    
        Parameters
        ----------
        parser : argparse.ArgumentParser
            Parser to use to parse command-line arguments
        argu : [str]
            command-line given
    
        Returns
        -------
        argparse.Namespace or None
            The arguments parsed, updated according to some rules. Exit program
            with error message if error occurs with arguments given.
        """
        args = parser.parse_args(argu)
        return args
    
    
    if __name__ == '__main__':
        import argparse
    
        my_parser = argparse.ArgumentParser(description="Compute pan-genome", add_help=False)
        build_parser(my_parser)
        OPTIONS = parse(my_parser, sys.argv[1:])
        main_from_parse(OPTIONS)