From d63ca806d0d343aa162373c5afd1d952052d9c3d Mon Sep 17 00:00:00 2001
From: asetGem <amandine.perrin@pasteur.fr>
Date: Wed, 17 Feb 2021 19:05:23 +0100
Subject: [PATCH] add --small option

If we run prodigal on small genomes, it is not possible to train (if <2000nuc). So, instead, just run prodigal, without a training file, using its '-p meta' procedure
---
 Examples/input_files/configfile.ini           |  1 +
 .../annotate_module/annotation_functions.py   | 28 +++++++++++++------
 PanACoTA/subcommands/annotate.py              | 25 ++++++++++++-----
 3 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/Examples/input_files/configfile.ini b/Examples/input_files/configfile.ini
index c481f05f..7c627756 100644
--- a/Examples/input_files/configfile.ini
+++ b/Examples/input_files/configfile.ini
@@ -25,6 +25,7 @@
 #qc_only = False
 #db_path =
 #prodigal_only = False
+#small = False
 #name =
 #date =
 #l90 = 100
diff --git a/PanACoTA/annotate_module/annotation_functions.py b/PanACoTA/annotate_module/annotation_functions.py
index fa62ab7e..81846f0f 100755
--- a/PanACoTA/annotate_module/annotation_functions.py
+++ b/PanACoTA/annotate_module/annotation_functions.py
@@ -58,7 +58,7 @@ logger = logging.getLogger('annotate.run_annotation_all')
 
 
 def run_annotation_all(genomes, threads, force, annot_folder, fgn, prodigal_only=False,
-                       quiet=False):
+                       small=False, quiet=False):
     """
     For each genome in genomes, run prokka (or only prodigal) to annotate the genome.
 
@@ -80,6 +80,8 @@ def run_annotation_all(genomes, threads, force, annot_folder, fgn, prodigal_only
         name (key in genomes dict) of the fist genome, which will be used for prodigal training
     prodigal_only : bool
         True if only prodigal must run, False if prokka must run
+    small : bool
+        True -> use -p meta option with prodigal. Do not use training
     quiet : bool
         True if nothing must be written to stderr/stdout, False otherwise
 
@@ -126,7 +128,10 @@ def run_annotation_all(genomes, threads, force, annot_folder, fgn, prodigal_only
         # If problem, gpath_train will be empty, but this will be checked while
         # trying to run prodigal, because we also need to check that genomes are not simply
         # already annotated
-        gpath_train = prodigal_train(gtrain, annot_folder)
+        if not small:
+            gpath_train = prodigal_train(gtrain, annot_folder)
+        else:
+            gpath_train = "small option"
     elif threads <= 3:
         # less than 3 threads: run prokka 1 by 1 with all threads
         cores_annot = threads
@@ -381,10 +386,13 @@ def run_prodigal(arguments):
         logger.warning("Prodigal results folder already exists, but is removed because "
                        "--force option was used.")
 
-    # If training file does not exist and prodigal result directory neither, return False
+    # Training file can be "small option", meaning that we did not use the training mode.
+    # If not "small option", we used the training mode. If training file does not exist 
+    # and prodigal result directory neither, return False
     # We cannot annotate using nothing.
     # Happens if there was a problem while training
-    if not os.path.isfile(gpath_train) and not os.path.isdir(prodigal_dir):
+    if (gpath_train != "small option" and not os.path.isfile(gpath_train) 
+        and not os.path.isdir(prodigal_dir)):
         return False
 
     logger.log(utils.detail_lvl(), f"Start annotating {name} (from {gpath} sequence) "
@@ -404,13 +412,13 @@ def run_prodigal(arguments):
                                            "remove this result folder, or use '-F' or '--force' "
                                            "option.".format(prodigal_dir))
 
-            logger.log(utils.detail_lvl(), "End annotating {} (from {})".format(name, gpath))
+            logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
         # If missing files, or other problems in result dir, error message,
         # ask user to force or remove this folder.
         else:
             logger.warning("Problems in the files contained in your already existing output dir "
-                           "({}). Please check it, or remove it to "
-                           "re-annotate.".format(prodigal_dir))
+                           f"({prodigal_dir}). Please check it, or remove it to "
+                           "re-annotate.")
         # If everything was ok -> everything is ready for next step -> return True
         # If something is wrong -> cannot use those results, genome won't be annotated
         # -> return False
@@ -430,8 +438,12 @@ def run_prodigal(arguments):
     error = (f"Error while trying to run prodigal. See {prodigal_logfile_err}.")
     prodigalf = open(prodigal_logfile, "w")
     prodigalferr = open(prodigal_logfile_err, "w")
+    if gpath_train == "small option":
+        training = "-p meta"
+    else:
+        training = f"-t {gpath_train}"
     cmd = (f"prodigal -i {gpath} -d {basic_outname + '.ffn'} -a {basic_outname + '.faa'} "
-           f"-f gff -o {basic_outname + '.gff'} -t {gpath_train} -q")
+           f"-f gff -o {basic_outname + '.gff'} {training} -q")
     logger.log(utils.detail_lvl(), "Prodigal command: " + cmd)
 
     ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
diff --git a/PanACoTA/subcommands/annotate.py b/PanACoTA/subcommands/annotate.py
index 7e0589ad..4b9776ad 100755
--- a/PanACoTA/subcommands/annotate.py
+++ b/PanACoTA/subcommands/annotate.py
@@ -151,12 +151,13 @@ def main_from_parse(arguments):
          arguments.name,
          arguments.date, arguments.l90, arguments.nbcont, arguments.cutn, arguments.threads,
          arguments.force, arguments.qc_only, arguments.from_info, arguments.tmpdir,
-         arguments.annotdir, arguments.verbose, arguments.quiet, arguments.prodigal_only)
+         arguments.annotdir, arguments.verbose, arguments.quiet, arguments.prodigal_only,
+         arguments.small)
 
 
 def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn=5,
          threads=1, force=False, qc_only=False, from_info=None, tmp_dir=None, res_annot_dir=None,
-         verbose=0, quiet=False, prodigal_only=False):
+         verbose=0, quiet=False, prodigal_only=False, small=False):
     """
     Main method, doing all steps:
 
@@ -224,6 +225,8 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn
         True if nothing must be sent to stdout/stderr, False otherwise
     prodigal_only : bool
         True -> run only prodigal. False -> run prokka
+    small : bool
+        True -> use -p meta option with prodigal
 
     Returns
     -------
@@ -381,7 +384,7 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn
 
     # STEP 4. Annotate all kept genomes
     results = pfunc.run_annotation_all(kept_genomes, threads, force, res_annot_dir, first_gname,
-                                       prodigal_only, quiet=quiet)
+                                       prodigal_only, small=small, quiet=quiet)
     # Information on genomes to format
     # results_ok = {genome: [gembase_name, path_to_origfile, path_split_gembase,
     #               gsize, nbcont, L90]}
@@ -470,6 +473,10 @@ def build_parser(parser):
                           help="Add this option if you only want syntactical annotation, given "
                                "by prodigal, and not functional annotation requiring prokka and "
                                "is slower.")
+    optional.add_argument("--small", dest="small", action="store_true", default=False,
+                          help="If you use Prodigal to annotate genomes, if you sequences are "
+                               "too small (less than 20000 characters), it cannot annotate them "
+                               "with the default options. Add this option to use 'meta' procedure.")
     optional.add_argument("--l90", dest="l90", type=int, default=100,
                           help="Maximum value of L90 allowed to keep a genome. Default is 100.")
     optional.add_argument("--nbcont", dest="nbcont", type=utils_argparse.cont_num, default=999,
@@ -582,6 +589,10 @@ def check_args(parser, args):
         return split + trust
 
     #  ERRORS
+    # Cannot be verbose and quiet at the same time
+    if args.verbose > 0 and args.quiet:
+        parser.error("Choose between a verbose output (-v) or a quiet output (-q)."
+                     " You cannot have both.")
     # User wants to run all annotation step: needs a genome dataset name
     if not args.qc_only and not args.name:
         parser.error("You must specify your genomes dataset name in 4 characters with "
@@ -591,10 +602,10 @@ def check_args(parser, args):
     # If QC only, we do not need name -> name is NONE
     if args.qc_only and not args.name:
         args.name = "NONE"
-    # Cannot be verbose and quiet at the same time
-    if args.verbose > 0 and args.quiet:
-        parser.error("Choose between a verbose output (-v) or a quiet output (-q)."
-                     " You cannot have both.")
+    # option --small used only with prodigal
+    if not args.prodigal_only and args.small:
+        parser.error("You cannot use --small option with prokka. Either use prodigal, "
+                     "or remove this option.")
     # If user specifies a cutN value (different than default one which is 5), and give
     # an info file, it is not compatible: info file will use sequences as is, and won't cut them
     if args.cutn != 5 and args.from_info:
-- 
GitLab