diff --git a/PanACoTA/subcommands/prepare.py b/PanACoTA/subcommands/prepare.py
index a79f5c48cc580a4cbe07a9f88bbfcf7fb0ecc8df..bb8d5f98b55383cc7b37cb9959cb224fc794965a 100644
--- a/PanACoTA/subcommands/prepare.py
+++ b/PanACoTA/subcommands/prepare.py
@@ -34,13 +34,13 @@ def main_from_parse(arguments):
cmd = "PanACoTA " + ' '.join(arguments.argv)
main(cmd, arguments.NCBI_species, arguments.NCBI_species_taxid, arguments.outdir,
arguments.tmp_dir, arguments.parallel, arguments.no_refseq, arguments.only_mash,
- arguments.l90, arguments.nbcont, arguments.cutn, arguments.min_dist,
+ arguments.from_info, arguments.l90, arguments.nbcont, arguments.cutn, arguments.min_dist,
arguments.verbose, arguments.quiet)
-def main(cmd, NCBI_species, NCBI_taxid, outdir, tmp_dir, threads, no_refseq, only_mash, l90,
- nbcont, cutn, min_dist, verbose, quiet):
+def main(cmd, NCBI_species, NCBI_taxid, outdir, tmp_dir, threads, no_refseq, only_mash, info_file,
+ l90, nbcont, cutn, min_dist, verbose, quiet):
"""
Main method, constructing the draft dataset for the given species
@@ -67,6 +67,9 @@ def main(cmd, NCBI_species, NCBI_taxid, outdir, tmp_dir, threads, no_refseq, onl
True if user does not want to download again the database
only_mash : bool
True if user user already has the database and quality of each genome (L90, #contigs etc.)
+ info_file : str
+ File containing information on QC if it was already ran before (columns to_annotate,
+ gsize, nb_conts and L90).
l90 : int
Max L90 allowed to keep a genome
nbcont : int
@@ -77,10 +80,12 @@ def main(cmd, NCBI_species, NCBI_taxid, outdir, tmp_dir, threads, no_refseq, onl
lower limit of distance between 2 genomes to keep them
verbose : int
verbosity:
- - defaut 0 : stdout contains INFO, stderr contains ERROR, .log contains INFO and more, .log.err contains warning and more
+ - defaut 0 : stdout contains INFO, stderr contains ERROR, .log contains INFO and more,
+ .log.err contains warning and more
- 1: same as 0 + WARNING in stderr
- 2: same as 1 + DETAILS in stdout + DETAILS in .log.details
- - >=15: same as 2 + Add DEBUG in stdout + create .log.debug with everything from info to debug
+ - >=15: same as 2 + Add DEBUG in stdout + create .log.debug with everything
+ from info to debug
quiet : bool
True if nothing must be sent to stdout/stderr, False otherwise
@@ -138,6 +143,11 @@ def main(cmd, NCBI_species, NCBI_taxid, outdir, tmp_dir, threads, no_refseq, onl
# - start from QC and mash (norefseq)
# - start from genome download (!norefseq))
if not only_mash:
+ # Not only mash, so a new info file will be created. If the user still gave an info
+ # file (he will be warned that it will be ignored), rename it with '.bak'
+ # to avoid erasing it
+ if info_file:
+ os.rename(info_file, info_file + ".back")
# 'no_refseq = True" : Do not download genomes, just do QC and mash filter on given genomes
# (sequences must, at least, be in outdir/refeq/bacteria/<genome_name>.fna.gz)
# (they can also be in Database_init/<genome_name>.fna)
@@ -180,15 +190,14 @@ def main(cmd, NCBI_species, NCBI_taxid, outdir, tmp_dir, threads, no_refseq, onl
# Do only mash filter. Genomes must be already downloaded, and there must be a file with
# all information on these genomes (L90 etc.)
else:
- info_file = os.path.join(outdir, "info-genomes-list-{}.lst".format(species_linked))
if not os.path.exists(info_file): # info-file missing -> error and exit
- logger.error(("You do not have the file called {} with all information about "
- "genomes. Provide it with the right name, or remove the '--mash' "
- "option to rerun quality control.".format(info_file)))
+ logger.error(f"Your info file {info_file} does not exist. Please Provide the "
+ "right name/path, or remove the '--mash-only option to rerun "
+ "quality control.")
sys.exit(1)
logger.info(("You want to rerun only mash steps. Getting information "
"from {}").format(info_file))
- genomes = utils.get_info_genomes(info_file, species_linked)
+ genomes = utils.read_genomes_info(info_file, species_linked, )
# Run Mash
# genomes : {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size,
@@ -251,6 +260,14 @@ def build_parser(parser):
"number of contigs and L90 values). "
"It will then get information on genomes quality from this "
"file, and run mash steps."))
+ optional.add_argument("--info", dest="from_info",
+ help="If you already ran the 'prepare' data module, or already "
+ "calculated yourself the size, L90 and number of contigs for each "
+ "genome, you can give this information, to go directly to "
+ "Mash filtering step. This file contains at "
+ "least 4 columns, tab separated, with the following headers: "
+ "'to_annotate', 'gsize', 'nb_conts', 'L90'. Any other column "
+ "will be ignored.")
optional.add_argument("-m", dest="min_dist", default=1e-4, type=float,
help="By default, genomes whose distance to the reference is not "
"between 1e-4 and 0.06 are discarded. You can specify your own "
@@ -334,6 +351,11 @@ def check_args(parser, args):
if not args.NCBI_species_taxid and not args.NCBI_species:
parser.error("Give at least an NCBI species name or taxID.")
+ # If user wants only mash steps, check that he gave info file
+ if args.only_mash and not args.from_info:
+ parser.error("If you want to run only Mash filtering steps, please give the "
+ "info file with the required information (see '--info' option")
+
# WARNINGS
# User did not specify a species name
if not args.NCBI_species:
@@ -351,6 +373,13 @@ def check_args(parser, args):
if args.l90 == 100 or args.nbcont == 999:
print(colored(thresholds_message(args.l90, args.nbcont), "yellow"))
+ # Warn if user gave info file, but does not ask to run only Mash -> info file will be ignored
+ if args.from_info and not args.only_mash:
+ message = ("You gave an info file (--info option), but did not ask to run only Mash "
+ "step (-M option). Your info file will be ignored (and renamed with '.back' "
+ "at the end), and another one will "
+ "be created with the new calculated values.")
+ print(colored(message))
return args