From 47dba37146b52bdc0d59b7288cc8258fd62b4a39 Mon Sep 17 00:00:00 2001
From: Amandine PERRIN <amandine.perrin@pasteur.fr>
Date: Tue, 20 Aug 2019 14:25:43 +0200
Subject: [PATCH] Add prepare module
---
PanACoTA/prepare_module/__init__.py | 1 +
.../prepare_module/download_genomes_func.py | 145 +++++++++
PanACoTA/subcommands/prepare.py | 296 ++++++++++++++++++
bin/PanACoTA | 13 +
setup.py | 2 +-
5 files changed, 456 insertions(+), 1 deletion(-)
create mode 100644 PanACoTA/prepare_module/__init__.py
create mode 100644 PanACoTA/prepare_module/download_genomes_func.py
create mode 100644 PanACoTA/subcommands/prepare.py
diff --git a/PanACoTA/prepare_module/__init__.py b/PanACoTA/prepare_module/__init__.py
new file mode 100644
index 00000000..09bbfbfc
--- /dev/null
+++ b/PanACoTA/prepare_module/__init__.py
@@ -0,0 +1 @@
+"""prepare module of PanACoTA"""
diff --git a/PanACoTA/prepare_module/download_genomes_func.py b/PanACoTA/prepare_module/download_genomes_func.py
new file mode 100644
index 00000000..0fab78ce
--- /dev/null
+++ b/PanACoTA/prepare_module/download_genomes_func.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+
+"""
+Functions to help download the refseq genomes of a species,
+gunzip them, add complete genomes...
+
+@author gem
+August 2017
+"""
+import os
+import logging
+import shutil
+import sys
+import glob
+import urllib.request
+import ncbi_genome_download as ngd
+
+from PanACoTA import utils
+
+
+logger = logging.getLogger("ddg.log_dds")
+
+
+def download_from_refseq(sum_file, NCBI_species, NCBI_taxid, outdir, threads):
+ """
+    Download all refseq genomes of the given species (filtered on the given taxid),
+    and return the path to the folder containing the uncompressed fasta files.
+ """
+    # Arguments needed to download all genomes of a species
+ keyargs = {"section": "refseq", "file_format": "fasta", "output": outdir,
+ "parallel": threads, "group": "bacteria",
+ "species_taxid": NCBI_taxid}
+    # If the summary file could not be downloaded (the given species name does not match
+    # any NCBI species), just download the genomes corresponding to the given taxid
+ if not sum_file:
+ logger.info("Downloading refseq genomes for taxid={}".format(NCBI_taxid))
+ else:
+ with open(sum_file, "r") as sum_lines:
+ for line in sum_lines:
+ infos = line.split()
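+                # In NCBI's assembly_summary.txt, column 7 (index 6) is 'species_taxid';
+                # header/comment lines are skipped by the int() conversion check below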
+                if len(infos) > 6:
+ try:
+ number = int(infos[6])
+ except ValueError:
+ continue
+ if number != int(NCBI_taxid):
+ logger.error("Your NCBI_taxid ({}) does not match with your provided NCBI "
+ "species ({}). The NCBI_taxid for this species is "
+ "{}".format(NCBI_taxid, NCBI_species, infos[6]))
+ sys.exit(1)
+ keyargs["genus"] = NCBI_species
+ logger.info("Downloading refseq genomes for {} (taxid={})".format(NCBI_species,
+ NCBI_taxid))
+ max_retries = 15
+
+ try:
+ ret = ngd.download(**keyargs)
+    except Exception:
+        logger.error("Could not download genomes for species taxid {}. Please check that "
+                     "the given taxid is correct.".format(NCBI_taxid))
+ sys.exit(1)
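+    # ngd.download() returns a non-zero code on failure; 75 is assumed to correspond to
+    # os.EX_TEMPFAIL, i.e. a temporary connection error that is worth retrying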
+ attempts = 0
+ while ret == 75 and attempts < max_retries:
+ attempts += 1
+        logger.error("Downloading from NCBI failed due to a connection error, "
+                     "retrying. Attempts so far: %s", attempts)
+ ret = ngd.download(**keyargs)
+ nb_gen, db_dir = to_database(outdir)
+ logger.info("Downloaded {} genomes.".format(nb_gen))
+ return db_dir
+
+
+def download_summary(species_linked, outdir):
+ """
+    Get the assembly_summary file for the given species, if it exists. To be able to
+    download it, the given NCBI species name must be exactly the name used on the NCBI website.
+
+    species_linked : given NCBI species with '_' instead of spaces, or NCBI taxid if the
+    species name was not given (in that case, the assembly file won't be found)
+
+    outdir: directory where the downloaded assembly file must be saved
+ """
+ logger.info("Retrieving assembly_summary file for {}".format(species_linked))
+ url = ("ftp://ftp.ncbi.nih.gov/genomes/refseq/"
+ "bacteria/{}/assembly_summary.txt").format(species_linked)
+ outfile = os.path.join(outdir, "assembly_summary-{}.txt".format(species_linked))
+ try:
+ urllib.request.urlretrieve(url, outfile)
+    except Exception:
+        logger.warning("assembly_summary file could not be downloaded. Please check that you "
+                       "provided the exact species name, as given on the NCBI website")
+ return
+ return outfile
+
+
+def to_database(outdir):
+ """
+ Move .fna.gz files to 'database_init' folder, and uncompress them.
+
+    outdir : directory where all results are (for now: refseq folders, assembly summary and log)
+
+ Return:
+ nb_gen : number of genomes downloaded
+ db_dir : directory where are all fna files downloaded from refseq
+ """
+ # Unzip fasta files and put to a same folder
+ logger.info("Uncompressing genome files.")
+ db_dir = os.path.join(outdir, "Database_init")
+ download_dir = os.path.join(outdir, "refseq", "bacteria")
+ os.makedirs(db_dir, exist_ok=True)
+ nb_gen = 0
+ for g_folder in os.listdir(download_dir):
+ fasta = glob.glob(os.path.join(download_dir, g_folder, "*.fna.gz"))
+ if len(fasta) == 0:
+ logger.warning("Problem with genome {}: no fasta file downloaded.".format(g_folder))
+ continue
+ elif len(fasta) > 1:
+ logger.warning("Problem with genome {}: several fasta files found.".format(g_folder))
+ continue
+ nb_gen += 1
+ fasta_file = os.path.basename(fasta[0])
+ fasta_out = os.path.join(db_dir, fasta_file)
+ shutil.copy(fasta[0], fasta_out)
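+        # copy the .fna.gz into Database_init and uncompress the copy with 'gunzip -f',
+        # so the original file downloaded under 'refseq/' is left untouched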
+ cmd = "gunzip {} -f".format(fasta_out)
+ error = "Error while trying to uncompress {}".format(fasta_out)
+ utils.run_cmd(cmd, error)
+ return nb_gen, db_dir
+
+
+def get_by_genome(all_list):
+ """
+    Given the list of replicons to download, group them by genome, so that all replicons
+    from the same genome can be directly concatenated into the same file.
+ """
+ to_download = {}
+ for file in all_list:
+ filename = os.path.basename(file)
+ strain = ".".join(filename.split(".")[:3])
+ if strain in to_download:
+ to_download[strain].append(file)
+ else:
+ to_download[strain] = [file]
+ return to_download
+
+
+
diff --git a/PanACoTA/subcommands/prepare.py b/PanACoTA/subcommands/prepare.py
new file mode 100644
index 00000000..07c42211
--- /dev/null
+++ b/PanACoTA/subcommands/prepare.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+"""
+Subcommand to prepare a dataset:
+
+- Download all genomes of a given species from refseq
+- Filter them with L90 and number of contigs thresholds
+- Remove genomes that are too close or too distant, using Mash
+
+
+@author Amandine PERRIN
+August 2019
+"""
+import os
+import sys
+import logging
+
+from PanACoTA import utils
+from PanACoTA.prepare_module import download_genomes_func as dgf
+
+def main_from_parse(arguments):
+ """
+ Call main function from the arguments given by parser
+
+ Parameters
+ ----------
+ arguments : argparse.Namespace
+ result of argparse parsing of all arguments in command line
+
+ """
+ cmd = "PanACoTA " + ' '.join(arguments.argv)
+ main(cmd, arguments.NCBI_species, arguments.NCBI_species_taxid, arguments.outdir,
+ arguments.parallel, arguments.no_refseq, arguments.only_mash,
+ arguments.min_dist, arguments.verbose, arguments.quiet)
+
+
+def main(cmd, NCBI_species, NCBI_species_taxid, outdir, threads, no_refseq, only_mash, min_dist,
+ verbose, quiet):
+ """
+ Main method, constructing the draft dataset for the given species
+
+    verbosity:
+    - default (0): stdout contains INFO, stderr contains ERROR
+    - 1: stdout contains INFO, stderr contains WARNING and ERROR
+    - 2: stdout contains DETAIL and INFO, stderr contains WARNING and ERROR
+    - >=15: also add DEBUG to stdout
+
+ Parameters
+ ----------
+ cmd : str
+ command line used to launch this program
+ NCBI_species : str
+ name of species to download, as given by NCBI
+ NCBI_species_taxid : int
+ species taxid given in NCBI
+ outdir : str
+ path to output directory (where created database will be saved).
+ threads : int
+ max number of threads to use
+ no_refseq : bool
+        True if the user does not want to download the database again
+    only_mash : bool
+        True if the user already has the database and the quality information for each
+        genome (L90, number of contigs, etc.)
+ min_dist : int
+ lower limit of distance between 2 genomes to keep them
+ verbose : int
+ verbosity:
+ default (0): info in stdout, error and more in stderr
+ 1 = add warnings in stderr
+ 2 = like 1 + add DETAIL to stdout (by default only INFO)
+ >15: add debug to stdout
+ quiet : bool
+ True if nothing must be sent to stdout/stderr, False otherwise
+
+ """
+ # Fixed limits. For now, we do not propose to user to give its own limits
+ max_dist = 0.06
+ max_l90 = 100
+ max_cont = 999
+
+ # get species name in NCBI format
+ if NCBI_species:
+ NCBI_species = NCBI_species.capitalize()
+ species_linked = "_".join(NCBI_species.split())
+ species_linked = "_".join(species_linked.split("/"))
+
+ # if species name not given by user, use taxID instead to name files
+ else:
+ species_linked = str(NCBI_species_taxid)
+ outdir = os.path.join(outdir, species_linked)
+ db_dir = None
+ os.makedirs(outdir, exist_ok=True)
+
+ # Initialize logger
+ # set level of logger: level is the minimum level that will be considered.
+ if verbose <= 1:
+ level = logging.INFO
+ # for verbose = 2, ignore only debug
+ if verbose >= 2 and verbose < 15:
+ level = utils.detail_lvl() # int corresponding to detail level
+ # for verbose >= 15, write everything
+ if verbose >= 15:
+ level = logging.DEBUG
+ logfile_base = os.path.join(outdir, "PanACoTA_prepare_{}").format(species_linked)
+ logfile_base = utils.init_logger(logfile_base, level, name='', details=True,
+ verbose=verbose, quiet=quiet)
+
+ # Start prepare step
+ logger = logging.getLogger('')
+ logger.info("Command used\n \t > " + cmd)
+ logger.info("'PanACoTA prepare' Will run on {} cores".format(threads))
+ sys.exit(1)
+
+ # Run more than only mash filter
+ if not only_mash:
+        if no_refseq:  # Do not download genomes, just do QC and mash filter on given genomes
+ logger.info('You asked to skip refseq downloads.')
+
+ else: # Do all steps: download, QC, mash filter
+ sum_file = dgf.download_summary(species_linked, outdir)
+            db_dir = dgf.download_from_refseq(sum_file, NCBI_species, NCBI_species_taxid,
+                                              outdir, threads)
+
+ # If refseq genomes already downloaded but not in the database_init folder,
+ # put them in it.
+ if not db_dir: # if norefseq and not already created before, db_dir can be missing
+ db_dir = os.path.join(outdir, "Database_init")
+        if no_refseq and not os.path.exists(db_dir):
+ # add genomes from refseq/bacteria folder to Database_init
+ nb_gen, _ = dgf.to_database(outdir)
+ logger.info("{} refseq genomes downloaded".format(nb_gen))
+ genomes = fg.check_quality(outdir, species_linked, db_dir, max_l90, max_cont)
+    # Do only the mash filter. Genomes must already be downloaded, and there must be a file
+    # with all information on these genomes (L90 etc.)
+ else:
+ info_file = os.path.join(outdir, "info-genomes-list-{}.lst".format(species_linked))
+ if not os.path.exists(info_file): # info-file missing -> error and exit
+ logger.error(("You do not have the file called {} with all information about "
+ "genomes. Provide it with the right name, or remove the '--mash' "
+ "option to rerun quality control.".format(info_file)))
+ sys.exit(1)
+ logger.info(("You want to rerun only mash steps. Getting information "
+ "from {}").format(info_file))
+ genomes = utils.get_info_genomes(info_file, species_linked)
+
+    # Run Mash
+ sorted_genomes = fg.sort_genomes_minhash(genomes, max_l90, max_cont)
+ removed = fg.iterative_mash(sorted_genomes, genomes, outdir, species_linked,
+ min_dist, max_dist, threads)
+
+ # Write list of genomes kept, and list of genomes removed
+ fg.write_outputfiles(sorted_genomes, removed, outdir, species_linked, min_dist)
+ logger.info("End")
+
+
+def build_parser(parser):
+ """
+ Method to create a parser for command-line options
+
+ Parameters
+ ----------
+ parser : argparse.ArgumentParser
+ parser to configure in order to extract command-line arguments
+ """
+ import multiprocessing
+ import argparse
+ from PanACoTA import utils_argparse
+
+ required = parser.add_argument_group('Required arguments')
+ required.add_argument("-t", dest="NCBI_species_taxid", required=True,
+ help=("Species taxid to download, corresponding to the "
+ "'species taxid' provided by the NCBI")
+ )
+
+ optional = parser.add_argument_group('Optional arguments')
+ optional.add_argument("-s", dest="NCBI_species",
+ help=("Species to download, corresponding to the "
+ "'organism name' provided by the NCBI. Give name given between "
+ "quotes (for example \"escherichia coli\")")
+ )
+ optional.add_argument("-o", dest="outdir", default=".",
+ help=("Give the path to the directory where you want to save the "
+ "database. In the given diretory, it will create a folder with "
+ "the gembase species name. Inside this folder, you will find a "
+ "folder 'Database_init' containing all fasta files, as well as a "
+ "folder 'refseq' with files downloaded by ncbi_genome_download."))
+ optional.add_argument("-p", dest="parallel", type=utils_argparse.thread_num, default=1,
+ help=("Run 'N' downloads in parallel (default=1). Put 0 if "
+ "you want to use all cores of your computer."))
+ optional.add_argument("-r", dest="no_refseq", action="store_true",
+ help=("If you already downloaded refseq genomes and do not want to "
+ "check them, add this option to directly go to the next steps:"
+ "quality control (L90, number of contigs...) and mash filter."))
+ optional.add_argument('-M', '--only-mash', dest="only_mash", action="store_true",
+ help=("Add this option if you already downloaded complete and refseq "
+ "genomes, and ran quality control (you have, in your result "
+ "folder, a file called 'info-genomes-list-<gembase_species>.lst', "
+ "contaning all genome names, as well as their genome size, "
+ "number of contigs and L90 values). "
+ "It will then get information on genomes quality from this "
+ "file, and run mash steps."))
+ optional.add_argument("-m", dest="min_dist", default=1e-4, type=float,
+ help="By default, genomes whose distance to the reference is not "
+ "between 1e-4 and 0.06 are discarded. You can specify your own "
+ "lower limit (instead of 1e-4) with this option.")
+ helper = parser.add_argument_group('Others')
+ helper.add_argument("-v", "--verbose", dest="verbose", action="count", default=0,
+ help="Increase verbosity in stdout/stderr.")
+ helper.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,
+ help=("Do not display anything to stdout/stderr. log files will "
+ "still be created."))
+ helper.add_argument("-h", "--help", dest="help", action="help",
+ help="show this help message and exit")
+
+
+def parse(parser, argu):
+ """
+    Parse arguments given to parser
+
+ Parameters
+ ----------
+ parser : argparse.ArgumentParser
+ the parser used
+ argu : [str]
+ command-line given by user, to parse using parser
+
+ Returns
+ -------
+ argparse.Namespace
+ Parsed arguments
+ """
+ import argparse
+
+ args = parser.parse_args(argu)
+ return check_args(parser, args)
+
+
+def check_args(parser, args):
+ """
+ Check that arguments given to parser are as expected.
+
+ Parameters
+ ----------
+ parser : argparse.ArgumentParser
+ The parser used to parse command-line
+ args : argparse.Namespace
+ Parsed arguments
+
+ Returns
+ -------
+ argparse.Namespace or None
+ The arguments parsed, updated according to some rules. Exit program
+ with error message if error occurs with arguments given.
+
+ """
+ from termcolor import colored
+ if not args.NCBI_species:
+ print(colored("WARNING: you did not provide a species name. All files will "
+ "be downloaded in a folder called with the NCBI species taxid, and "
+ "you won't be able to get the summary file (showing a summary of all "
+ "strains downloaded)", "yellow"))
+ yes = ["y", "yes", "Y"]
+ no = ["n", "no", "N"]
+ while True:
+ print("Do you still want to continue? [Y/n] ", end ="")
+ choice = input().lower()
+ if choice in no:
+ sys.exit(0)
+ elif choice in yes or choice == '':
+ return args
+ else:
+ sys.stdout.write("Please answer with 'y' or 'n': ")
+    return args
+
+
+if __name__ == '__main__':
+ import argparse
+ from textwrap import dedent
+ header = '''
+ ___ _____ ___ _____ _____
+ ( _`\ ( _ )( _`\ (_ _)( _ )
+ | |_) ) _ _ ___ | (_) || ( (_) _ | | | (_) |
+ | ,__/'/'_` )/' _ `\| _ || | _ /'_`\ | | | _ |
+ | | ( (_| || ( ) || | | || (_( )( (_) )| | | | | |
+ (_) `\__,_)(_) (_)(_) (_)(____/'`\___/'(_) (_) (_)
+
+
+ Large scale comparative genomics tools
+
+ -------------------------------------------
+ '''
+ my_parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=dedent(header), add_help=False)
+ build_parser(my_parser)
+ OPTIONS = parse(my_parser, sys.argv[1:])
+ main_from_parse(OPTIONS)
diff --git a/bin/PanACoTA b/bin/PanACoTA
index 86609c4a..4359014b 100755
--- a/bin/PanACoTA
+++ b/bin/PanACoTA
@@ -6,6 +6,7 @@ from textwrap import dedent
from PanACoTA import __version__ as version
+from PanACoTA.subcommands import prepare
from PanACoTA.subcommands import annotate
from PanACoTA.subcommands import pangenome
from PanACoTA.subcommands import corepers
@@ -63,6 +64,18 @@ def parse_arguments(argv):
actions = {} # to add the action to do according to the subparser called
checks = {} # to add the function to call to check the subparser arguments
+    # Prepare part. Start with ASCII art title, + small description of subcommand
+ parser_prepare = subparsers.add_parser('prepare',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=(dedent(header) +
+ "\n=> Prepare draft dataset"),
+ epilog=footer,
+ help="Prepare draft dataset",
+ add_help=False)
+ prepare.build_parser(parser_prepare)
+ actions["prepare"] = prepare.main_from_parse
+ checks["prepare"] = prepare.check_args
+
# QC and annotation part. Start with ASCII art title, + small description of subcommand
parser_annotate = subparsers.add_parser('annotate',
formatter_class=argparse.RawDescriptionHelpFormatter,
diff --git a/setup.py b/setup.py
index 1e6c4e43..7ce41fa0 100755
--- a/setup.py
+++ b/setup.py
@@ -38,7 +38,7 @@ def parse_requirements(requirements):
and not l.startswith('#')]
-packages = ['PanACoTA', 'PanACoTA.annotate_module',
+packages = ['PanACoTA', 'PanACoTA.prepare_module', 'PanACoTA.annotate_module',
'PanACoTA.pangenome_module', 'PanACoTA.corepers_module',
'PanACoTA.align_module', 'PanACoTA.tree_module', 'PanACoTA.subcommands']
requires = parse_requirements("requirements.txt")
--
GitLab