Skip to content
Snippets Groups Projects
Commit d63ca806 authored by Amandine  PERRIN's avatar Amandine PERRIN
Browse files

add --small option

If we run prodigal on small genomes, it is not possible to train (if <2000nuc). So, instead, just run prodigal, without a training file, using its '-p meta' procedure
parent 0e1e9ba5
No related branches found
No related tags found
No related merge requests found
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#qc_only = False #qc_only = False
#db_path = #db_path =
#prodigal_only = False #prodigal_only = False
#small = False
#name = #name =
#date = #date =
#l90 = 100 #l90 = 100
......
...@@ -58,7 +58,7 @@ logger = logging.getLogger('annotate.run_annotation_all') ...@@ -58,7 +58,7 @@ logger = logging.getLogger('annotate.run_annotation_all')
def run_annotation_all(genomes, threads, force, annot_folder, fgn, prodigal_only=False, def run_annotation_all(genomes, threads, force, annot_folder, fgn, prodigal_only=False,
quiet=False): small=False, quiet=False):
""" """
For each genome in genomes, run prokka (or only prodigal) to annotate the genome. For each genome in genomes, run prokka (or only prodigal) to annotate the genome.
...@@ -80,6 +80,8 @@ def run_annotation_all(genomes, threads, force, annot_folder, fgn, prodigal_only ...@@ -80,6 +80,8 @@ def run_annotation_all(genomes, threads, force, annot_folder, fgn, prodigal_only
name (key in genomes dict) of the fist genome, which will be used for prodigal training name (key in genomes dict) of the fist genome, which will be used for prodigal training
prodigal_only : bool prodigal_only : bool
True if only prodigal must run, False if prokka must run True if only prodigal must run, False if prokka must run
small : bool
True -> use -p meta option with prodigal. Do not use training
quiet : bool quiet : bool
True if nothing must be written to stderr/stdout, False otherwise True if nothing must be written to stderr/stdout, False otherwise
...@@ -126,7 +128,10 @@ def run_annotation_all(genomes, threads, force, annot_folder, fgn, prodigal_only ...@@ -126,7 +128,10 @@ def run_annotation_all(genomes, threads, force, annot_folder, fgn, prodigal_only
# If problem, gpath_train will be empty, but this will be checked while # If problem, gpath_train will be empty, but this will be checked while
# trying to run prodigal, because we also need to check that genomes are not simply # trying to run prodigal, because we also need to check that genomes are not simply
# already annotated # already annotated
if not small:
gpath_train = prodigal_train(gtrain, annot_folder) gpath_train = prodigal_train(gtrain, annot_folder)
else:
gpath_train = "small option"
elif threads <= 3: elif threads <= 3:
# less than 3 threads: run prokka 1 by 1 with all threads # less than 3 threads: run prokka 1 by 1 with all threads
cores_annot = threads cores_annot = threads
...@@ -381,10 +386,13 @@ def run_prodigal(arguments): ...@@ -381,10 +386,13 @@ def run_prodigal(arguments):
logger.warning("Prodigal results folder already exists, but is removed because " logger.warning("Prodigal results folder already exists, but is removed because "
"--force option was used.") "--force option was used.")
# If training file does not exist and prodigal result directory neither, return False # Training file can be "small option", meaning that we did not use the training mode.
# If not "small option", we used the training mode. If training file does not exist
# and prodigal result directory neither, return False
# We cannot annotate using nothing. # We cannot annotate using nothing.
# Happens if there was a problem while training # Happens if there was a problem while training
if not os.path.isfile(gpath_train) and not os.path.isdir(prodigal_dir): if (gpath_train != "small option" and not os.path.isfile(gpath_train)
and not os.path.isdir(prodigal_dir)):
return False return False
logger.log(utils.detail_lvl(), f"Start annotating {name} (from {gpath} sequence) " logger.log(utils.detail_lvl(), f"Start annotating {name} (from {gpath} sequence) "
...@@ -404,13 +412,13 @@ def run_prodigal(arguments): ...@@ -404,13 +412,13 @@ def run_prodigal(arguments):
"remove this result folder, or use '-F' or '--force' " "remove this result folder, or use '-F' or '--force' "
"option.".format(prodigal_dir)) "option.".format(prodigal_dir))
logger.log(utils.detail_lvl(), "End annotating {} (from {})".format(name, gpath)) logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
# If missing files, or other problems in result dir, error message, # If missing files, or other problems in result dir, error message,
# ask user to force or remove this folder. # ask user to force or remove this folder.
else: else:
logger.warning("Problems in the files contained in your already existing output dir " logger.warning("Problems in the files contained in your already existing output dir "
"({}). Please check it, or remove it to " f"({prodigal_dir}). Please check it, or remove it to "
"re-annotate.".format(prodigal_dir)) "re-annotate.")
# If everything was ok -> everything is ready for next step -> return True # If everything was ok -> everything is ready for next step -> return True
# If something is wrong -> cannot use those results, genome won't be annotated # If something is wrong -> cannot use those results, genome won't be annotated
# -> return False # -> return False
...@@ -430,8 +438,12 @@ def run_prodigal(arguments): ...@@ -430,8 +438,12 @@ def run_prodigal(arguments):
error = (f"Error while trying to run prodigal. See {prodigal_logfile_err}.") error = (f"Error while trying to run prodigal. See {prodigal_logfile_err}.")
prodigalf = open(prodigal_logfile, "w") prodigalf = open(prodigal_logfile, "w")
prodigalferr = open(prodigal_logfile_err, "w") prodigalferr = open(prodigal_logfile_err, "w")
if gpath_train == "small option":
training = "-p meta"
else:
training = f"-t {gpath_train}"
cmd = (f"prodigal -i {gpath} -d {basic_outname + '.ffn'} -a {basic_outname + '.faa'} " cmd = (f"prodigal -i {gpath} -d {basic_outname + '.ffn'} -a {basic_outname + '.faa'} "
f"-f gff -o {basic_outname + '.gff'} -t {gpath_train} -q") f"-f gff -o {basic_outname + '.gff'} {training} -q")
logger.log(utils.detail_lvl(), "Prodigal command: " + cmd) logger.log(utils.detail_lvl(), "Prodigal command: " + cmd)
ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf, ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
......
...@@ -151,12 +151,13 @@ def main_from_parse(arguments): ...@@ -151,12 +151,13 @@ def main_from_parse(arguments):
arguments.name, arguments.name,
arguments.date, arguments.l90, arguments.nbcont, arguments.cutn, arguments.threads, arguments.date, arguments.l90, arguments.nbcont, arguments.cutn, arguments.threads,
arguments.force, arguments.qc_only, arguments.from_info, arguments.tmpdir, arguments.force, arguments.qc_only, arguments.from_info, arguments.tmpdir,
arguments.annotdir, arguments.verbose, arguments.quiet, arguments.prodigal_only) arguments.annotdir, arguments.verbose, arguments.quiet, arguments.prodigal_only,
arguments.small)
def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn=5, def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn=5,
threads=1, force=False, qc_only=False, from_info=None, tmp_dir=None, res_annot_dir=None, threads=1, force=False, qc_only=False, from_info=None, tmp_dir=None, res_annot_dir=None,
verbose=0, quiet=False, prodigal_only=False): verbose=0, quiet=False, prodigal_only=False, small=False):
""" """
Main method, doing all steps: Main method, doing all steps:
...@@ -224,6 +225,8 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn ...@@ -224,6 +225,8 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn
True if nothing must be sent to stdout/stderr, False otherwise True if nothing must be sent to stdout/stderr, False otherwise
prodigal_only : bool prodigal_only : bool
True -> run only prodigal. False -> run prokka True -> run only prodigal. False -> run prokka
small : bool
True -> use -p meta option with prodigal
Returns Returns
------- -------
...@@ -381,7 +384,7 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn ...@@ -381,7 +384,7 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn
# STEP 4. Annotate all kept genomes # STEP 4. Annotate all kept genomes
results = pfunc.run_annotation_all(kept_genomes, threads, force, res_annot_dir, first_gname, results = pfunc.run_annotation_all(kept_genomes, threads, force, res_annot_dir, first_gname,
prodigal_only, quiet=quiet) prodigal_only, small=small, quiet=quiet)
# Information on genomes to format # Information on genomes to format
# results_ok = {genome: [gembase_name, path_to_origfile, path_split_gembase, # results_ok = {genome: [gembase_name, path_to_origfile, path_split_gembase,
# gsize, nbcont, L90]} # gsize, nbcont, L90]}
...@@ -470,6 +473,10 @@ def build_parser(parser): ...@@ -470,6 +473,10 @@ def build_parser(parser):
help="Add this option if you only want syntactical annotation, given " help="Add this option if you only want syntactical annotation, given "
"by prodigal, and not functional annotation requiring prokka and " "by prodigal, and not functional annotation requiring prokka and "
"is slower.") "is slower.")
optional.add_argument("--small", dest="small", action="store_true", default=False,
help="If you use Prodigal to annotate genomes, if you sequences are "
"too small (less than 20000 characters), it cannot annotate them "
"with the default options. Add this option to use 'meta' procedure.")
optional.add_argument("--l90", dest="l90", type=int, default=100, optional.add_argument("--l90", dest="l90", type=int, default=100,
help="Maximum value of L90 allowed to keep a genome. Default is 100.") help="Maximum value of L90 allowed to keep a genome. Default is 100.")
optional.add_argument("--nbcont", dest="nbcont", type=utils_argparse.cont_num, default=999, optional.add_argument("--nbcont", dest="nbcont", type=utils_argparse.cont_num, default=999,
...@@ -582,6 +589,10 @@ def check_args(parser, args): ...@@ -582,6 +589,10 @@ def check_args(parser, args):
return split + trust return split + trust
# ERRORS # ERRORS
# Cannot be verbose and quiet at the same time
if args.verbose > 0 and args.quiet:
parser.error("Choose between a verbose output (-v) or a quiet output (-q)."
" You cannot have both.")
# User wants to run all annotation step: needs a genome dataset name # User wants to run all annotation step: needs a genome dataset name
if not args.qc_only and not args.name: if not args.qc_only and not args.name:
parser.error("You must specify your genomes dataset name in 4 characters with " parser.error("You must specify your genomes dataset name in 4 characters with "
...@@ -591,10 +602,10 @@ def check_args(parser, args): ...@@ -591,10 +602,10 @@ def check_args(parser, args):
# If QC only, we do not need name -> name is NONE # If QC only, we do not need name -> name is NONE
if args.qc_only and not args.name: if args.qc_only and not args.name:
args.name = "NONE" args.name = "NONE"
# Cannot be verbose and quiet at the same time # option --small used only with prodigal
if args.verbose > 0 and args.quiet: if not args.prodigal_only and args.small:
parser.error("Choose between a verbose output (-v) or a quiet output (-q)." parser.error("You cannot use --small option with prokka. Either use prodigal, "
" You cannot have both.") "or remove this option.")
# If user specifies a cutN value (different than default one which is 5), and give # If user specifies a cutN value (different than default one which is 5), and give
# an info file, it is not compatible: info file will use sequences as is, and won't cut them # an info file, it is not compatible: info file will use sequences as is, and won't cut them
if args.cutn != 5 and args.from_info: if args.cutn != 5 and args.from_info:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment