From d63ca806d0d343aa162373c5afd1d952052d9c3d Mon Sep 17 00:00:00 2001 From: asetGem <amandine.perrin@pasteur.fr> Date: Wed, 17 Feb 2021 19:05:23 +0100 Subject: [PATCH] add --small option If we run prodigal on small genomes, it is not possible to train (if <2000nuc). So, instead, just run prodigal, without a training file, using its '-p meta' procedure --- Examples/input_files/configfile.ini | 1 + .../annotate_module/annotation_functions.py | 28 +++++++++++++------ PanACoTA/subcommands/annotate.py | 25 ++++++++++++----- 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/Examples/input_files/configfile.ini b/Examples/input_files/configfile.ini index c481f05f..7c627756 100644 --- a/Examples/input_files/configfile.ini +++ b/Examples/input_files/configfile.ini @@ -25,6 +25,7 @@ #qc_only = False #db_path = #prodigal_only = False +#small = False #name = #date = #l90 = 100 diff --git a/PanACoTA/annotate_module/annotation_functions.py b/PanACoTA/annotate_module/annotation_functions.py index fa62ab7e..81846f0f 100755 --- a/PanACoTA/annotate_module/annotation_functions.py +++ b/PanACoTA/annotate_module/annotation_functions.py @@ -58,7 +58,7 @@ logger = logging.getLogger('annotate.run_annotation_all') def run_annotation_all(genomes, threads, force, annot_folder, fgn, prodigal_only=False, - quiet=False): + small=False, quiet=False): """ For each genome in genomes, run prokka (or only prodigal) to annotate the genome. @@ -80,6 +80,8 @@ def run_annotation_all(genomes, threads, force, annot_folder, fgn, prodigal_only name (key in genomes dict) of the fist genome, which will be used for prodigal training prodigal_only : bool True if only prodigal must run, False if prokka must run + small : bool + True -> use -p meta option with prodigal. Do not use training quiet : bool True if nothing must be written to stderr/stdout, False otherwise @@ -126,7 +128,10 @@ def run_annotation_all(genomes, threads, force, annot_folder, fgn, prodigal_only # If problem, gpath_train will be empty, but this will be checked while # trying to run prodigal, because we also need to check that genomes are not simply # already annotated - gpath_train = prodigal_train(gtrain, annot_folder) + if not small: + gpath_train = prodigal_train(gtrain, annot_folder) + else: + gpath_train = "small option" elif threads <= 3: # less than 3 threads: run prokka 1 by 1 with all threads cores_annot = threads @@ -381,10 +386,13 @@ def run_prodigal(arguments): logger.warning("Prodigal results folder already exists, but is removed because " "--force option was used.") - # If training file does not exist and prodigal result directory neither, return False + # Training file can be "small option", meaning that we did not use the training mode. + # If not "small option", we used the training mode. If training file does not exist + # and prodigal result directory neither, return False # We cannot annotate using nothing. # Happens if there was a problem while training - if not os.path.isfile(gpath_train) and not os.path.isdir(prodigal_dir): + if (gpath_train != "small option" and not os.path.isfile(gpath_train) + and not os.path.isdir(prodigal_dir)): return False logger.log(utils.detail_lvl(), f"Start annotating {name} (from {gpath} sequence) " @@ -404,13 +412,13 @@ def run_prodigal(arguments): "remove this result folder, or use '-F' or '--force' " "option.".format(prodigal_dir)) - logger.log(utils.detail_lvl(), "End annotating {} (from {})".format(name, gpath)) + logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})") # If missing files, or other problems in result dir, error message, # ask user to force or remove this folder. else: logger.warning("Problems in the files contained in your already existing output dir " - "({}). Please check it, or remove it to " - "re-annotate.".format(prodigal_dir)) + f"({prodigal_dir}). Please check it, or remove it to " + "re-annotate.") # If everything was ok -> everything is ready for next step -> return True # If something is wrong -> cannot use those results, genome won't be annotated # -> return False @@ -430,8 +438,12 @@ def run_prodigal(arguments): error = (f"Error while trying to run prodigal. See {prodigal_logfile_err}.") prodigalf = open(prodigal_logfile, "w") prodigalferr = open(prodigal_logfile_err, "w") + if gpath_train == "small option": + training = "-p meta" + else: + training = f"-t {gpath_train}" cmd = (f"prodigal -i {gpath} -d {basic_outname + '.ffn'} -a {basic_outname + '.faa'} " - f"-f gff -o {basic_outname + '.gff'} -t {gpath_train} -q") + f"-f gff -o {basic_outname + '.gff'} {training} -q") logger.log(utils.detail_lvl(), "Prodigal command: " + cmd) ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf, diff --git a/PanACoTA/subcommands/annotate.py b/PanACoTA/subcommands/annotate.py index 7e0589ad..4b9776ad 100755 --- a/PanACoTA/subcommands/annotate.py +++ b/PanACoTA/subcommands/annotate.py @@ -151,12 +151,13 @@ def main_from_parse(arguments): arguments.name, arguments.date, arguments.l90, arguments.nbcont, arguments.cutn, arguments.threads, arguments.force, arguments.qc_only, arguments.from_info, arguments.tmpdir, - arguments.annotdir, arguments.verbose, arguments.quiet, arguments.prodigal_only) + arguments.annotdir, arguments.verbose, arguments.quiet, arguments.prodigal_only, + arguments.small) def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn=5, threads=1, force=False, qc_only=False, from_info=None, tmp_dir=None, res_annot_dir=None, - verbose=0, quiet=False, prodigal_only=False): + verbose=0, quiet=False, prodigal_only=False, small=False): """ Main method, doing all steps: @@ -224,6 +225,8 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn True if nothing must be sent to stdout/stderr, False otherwise prodigal_only : bool True -> run only prodigal. False -> run prokka + small : bool + True -> use -p meta option with prodigal Returns ------- @@ -381,7 +384,7 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn # STEP 4. Annotate all kept genomes results = pfunc.run_annotation_all(kept_genomes, threads, force, res_annot_dir, first_gname, - prodigal_only, quiet=quiet) + prodigal_only, small=small, quiet=quiet) # Information on genomes to format # results_ok = {genome: [gembase_name, path_to_origfile, path_split_gembase, # gsize, nbcont, L90]} @@ -470,6 +473,10 @@ def build_parser(parser): help="Add this option if you only want syntactical annotation, given " "by prodigal, and not functional annotation requiring prokka and " "is slower.") + optional.add_argument("--small", dest="small", action="store_true", default=False, + help="If you use Prodigal to annotate genomes, if you sequences are " + "too small (less than 20000 characters), it cannot annotate them " + "with the default options. Add this option to use 'meta' procedure.") optional.add_argument("--l90", dest="l90", type=int, default=100, help="Maximum value of L90 allowed to keep a genome. Default is 100.") optional.add_argument("--nbcont", dest="nbcont", type=utils_argparse.cont_num, default=999, @@ -582,6 +589,10 @@ def check_args(parser, args): return split + trust # ERRORS + # Cannot be verbose and quiet at the same time + if args.verbose > 0 and args.quiet: + parser.error("Choose between a verbose output (-v) or a quiet output (-q)." + " You cannot have both.") # User wants to run all annotation step: needs a genome dataset name if not args.qc_only and not args.name: parser.error("You must specify your genomes dataset name in 4 characters with " @@ -591,10 +602,10 @@ def check_args(parser, args): # If QC only, we do not need name -> name is NONE if args.qc_only and not args.name: args.name = "NONE" - # Cannot be verbose and quiet at the same time - if args.verbose > 0 and args.quiet: - parser.error("Choose between a verbose output (-v) or a quiet output (-q)." - " You cannot have both.") + # option --small used only with prodigal + if not args.prodigal_only and args.small: + parser.error("You cannot use --small option with prokka. Either use prodigal, " + "or remove this option.") # If user specifies a cutN value (different than default one which is 5), and give # an info file, it is not compatible: info file will use sequences as is, and won't cut them if args.cutn != 5 and args.from_info: -- GitLab