From c29a701dbc5fa1ca152ab54afe043e3d7dfda9fe Mon Sep 17 00:00:00 2001
From: Amandine PERRIN <amandine.perrin@pasteur.fr>
Date: Thu, 5 Sep 2019 09:57:34 +0200
Subject: [PATCH] prepare module working

Can be launched:
 - from scratch
 - from already downloaded genomes
 - from genomes whose QC is already done (only mash is run)

TODO: tests !!!
---
 PanACoTA/subcommands/prepare.py | 10 ++--
 PanACoTA/utils.py               | 83 ++++++++++++++-------------------
 2 files changed, 41 insertions(+), 52 deletions(-)

diff --git a/PanACoTA/subcommands/prepare.py b/PanACoTA/subcommands/prepare.py
index bb8d5f98..1f91485c 100644
--- a/PanACoTA/subcommands/prepare.py
+++ b/PanACoTA/subcommands/prepare.py
@@ -190,15 +190,15 @@ def main(cmd, NCBI_species, NCBI_taxid, outdir, tmp_dir, threads, no_refseq, onl
     # Do only mash filter. Genomes must be already downloaded, and there must be a file with
     # all information on these genomes (L90 etc.)
     else:
+        logger.warning('You asked to run only mash steps.')
         if not os.path.exists(info_file):  # info-file missing -> error and exit
             logger.error(f"Your info file {info_file} does not exist. Please Provide the  "
                           "right name/path, or remove the '--mash-only option to rerun "
                           "quality control.")
             sys.exit(1)
-        logger.info(("You want to rerun only mash steps. Getting information "
+        logger.info(("You want to run only mash steps. Getting information "
                      "from {}").format(info_file))
         genomes = utils.read_genomes_info(info_file, species_linked, )
-
     # Run Mash
     # genomes : {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size,
                              # nbcont, l90]}
@@ -354,7 +354,7 @@ def check_args(parser, args):
     # If user wants only mash steps, check that he gave info file
     if args.only_mash and not args.from_info:
         parser.error("If you want to run only Mash filtering steps, please give the "
-                     "info file with the required information (see '--info' option")
+                     "info file with the required information (see '--info' option)")
 
     # WARNINGS
     # User did not specify a species name
@@ -375,11 +375,11 @@ def check_args(parser, args):
 
     # Warn if user gave info file, but does not ask to run only Mash -> info file will be ignored
     if args.from_info and not args.only_mash:
-        message = ("You gave an info file (--info option), but did not ask to run only Mash "
+        message = ("  !! You gave an info file (--info option), but did not ask to run only Mash "
                    "step (-M option). Your info file will be ignored (and renamed with '.back' "
                    "at the end), and another one will "
                    "be created with the new calculated values.")
-        print(colored(message))
+        print(colored(message, "yellow"))
 
     return args
 
diff --git a/PanACoTA/utils.py b/PanACoTA/utils.py
index 129a83b8..c03408a0 100755
--- a/PanACoTA/utils.py
+++ b/PanACoTA/utils.py
@@ -620,7 +620,7 @@ def read_genomes(list_file, name, date, dbpath, tmp_path):
     dict
         {genome: spegenus.date} spegenus.date = name.date
     """
-    logger = logging.getLogger("utils")
+    logger = logging.getLogger("prepare.utils")
     logger.info("Reading genomes")
     genomes = {}
     # Check that given list file exists
@@ -686,15 +686,13 @@ def read_genomes(list_file, name, date, dbpath, tmp_path):
     return genomes
 
 
-def read_genomes_info(list_file, name, date, dbpath, dbpath2):
+def read_genomes_info(list_file, name, date=None, logger=None):
     """
     Read a lstinfo file containing the list of genomes with information (L90, genome size etc.).
     1 line per genome, 4 required columns (Others will just be ignored):
     to_annotate gsize nb_conts L90
 
-    Check that the given genome file (to_annotate column) exists in dbpath or in dbpath2
-    (files can be split into 2 different folders). If in none of those 2 folders, put a
-    warning message and ignore this file.
+    Check that the given genome file (to_annotate column) exists.
 
     Parameters
     ----------
@@ -704,13 +702,9 @@ def read_genomes_info(list_file, name, date, dbpath, dbpath2):
         Default species name
     date : str
         Default date
-    dbpath : str
-        path to folder containing genome files to annotate
-    dbpath2 : str
-        path to other folder which can contain the genome files to annotate. For example,
-        if it comes from a previous run of 'PanACoTA annotate' where sequences needed to be
-        modified to be ready for annotation (for example, merging several contig files in one
-        file, split at each stretch of 5 'N', etc.).
+    logger : logging.Logger
+        logger object to write log information
+
 
     Returns
     -------
@@ -719,17 +713,21 @@ def read_genomes_info(list_file, name, date, dbpath, dbpath2):
                    [spegenus.date, path_orig_seq, path_to_splitSequence, size, nbcont, l90]
                   }
     """
-    logger = logging.getLogger("utils")
+    if not logger:
+        logger = logging.getLogger("prepare.utils")
     logger.info(f"Reading given information on your genomes in {list_file}")
     genomes = {}
-    spegenus = "{}.{}".format(name, date)
+    if name and date:
+        spegenus = "{}.{}".format(name, date)
     column_order = {} # Put the number of column corresponding to each field
     if not os.path.isfile(list_file):
-        logger.error(("ERROR: You used the '--info <filename>' option, meaning that <filename> "
-                      "contains information on each sequence provided (L90, nb contigs etc.). "
-                      "However, the provided genome information file '{}' does not exist. "
-                      "Please provide a listinfo file.\n Ending program.").format(list_file))
+        logger.error(f"ERROR: The info file {list_file} that you gave does not exist. "
+                      "Please provide the  right path/name for this file.\nEnding program.")
         sys.exit(1)
+    message_no_header = (f"ERROR: It seems that your info file {list_file} does not have a "
+                          "header, or this header does not have, at least, the required "
+                          "columns tab separated: to_annotate, gsize nb_conts and L90 (in any "
+                          "order).\nEnding program.")
     with open(list_file, "r") as lff:
         for line in lff:
             line = line.strip()
@@ -740,16 +738,12 @@ def read_genomes_info(list_file, name, date, dbpath, dbpath2):
                 found = [head for head in ["to_annotate", "gsize", "nb_conts", "L90"]
                          if head in column_order]
                 if len(found) != 4:
-                    logger.error("ERROR: Your information file does not have the required "
-                                 "columns, tab separated: to_annotate, gsize, nb_conts and L90 "
-                                 "(in any order) \n Ending program.")
+                    logger.error(message_no_header)
                     sys.exit(1)
                 continue
             # If no header found, error message and exit
             if not column_order:
-                logger.error("ERROR: It seems that your info file does not have a header, or "
-                             "this header does not have, at least, the required columns"
-                             "separated: to_annotate, gsize nb_conts and L90 (in any order).\n")
+                logger.error(message_no_header)
                 sys.exit(1)
             # Get all information on the given genome
             # line.strip().split() -> all given information.
@@ -758,42 +752,36 @@ def read_genomes_info(list_file, name, date, dbpath, dbpath2):
             try:
                 infos = line.strip().split()
                 # Get genome name with its path to db_dir
-                gname = infos[column_order["to_annotate"]]
-                gpath = os.path.join(dbpath, gname)
+                gpath = infos[column_order["to_annotate"]]
                 gsize = int(infos[column_order["gsize"]])
                 gl90 = int(infos[column_order["L90"]])
                 gcont = int(infos[column_order["nb_conts"]])
             # If invalid values, warning message and ignore genome
             except ValueError:
-                logger.warning("For genome {}, at least one of your columns 'gsize', 'nb_conts' "
-                               "or 'L90' contains a non numeric character. This genome will be "
-                               "ignored.".format(gname))
+                logger.warning(f"For genome {gname}, at least one of your columns 'gsize', "
+                                "'nb_conts' or 'L90' contains a non numeric character. "
+                                "This genome will be ignored.")
                 continue
             # If no value for at least 1 field, warning message and ignore genome
             except IndexError:
-                logger.error("ERROR: Check that all fields of {} are filled in each "
-                               "line (can be 'NA')".format(list_file))
-                sys.exit(-1)
-
+                logger.error("ERROR: Check that all fields of {list_file} are filled in each "
+                             "line (can be 'NA')")
+                sys.exit(1)
             # Could we find genome file?
             # Check if genome file exists in db_path.
             if not os.path.isfile(gpath):
-                if dbpath2:
-                    # If it does not exist, and there is a 2nd db_path, check if it exists there
-                    gpath = os.path.join(dbpath2, gname)
-                    # If still not in db_path2, warning message and ignore genome
-                    if not os.path.isfile(gpath):
-                        logger.warning("{} genome file does not exist in the given "
-                                       "database {} nor in the other directory ({}). "
-                                       "It will be ignored.".format(gname, dbpath, dbpath2))
-                        continue
-                else:
-                    logger.warning("{} genome file does not exist in the given "
-                                   "database {}. It will be ignored.".format(gname, dbpath))
-                    continue
+                logger.warning(f"{gpath} genome file does not exist. This genome will be ignored.")
+                continue
             # cur genome information to save:
             # [spegenus.date, path_orig_seq, path_to_sequence_to_annotate, size, nbcont, l90]
-            genomes[gname] = [spegenus, gpath, gpath, gsize, gcont, gl90]
+            if name and date:
+                genomes[gpath] = [spegenus, gpath, gpath, gsize, gcont, gl90]
+            # If called from prepare, no need to rename genomes
+            else:
+                gfile = os.path.basename(gpath)
+                gname = os.path.splitext(gfile)[0]
+                genomes[gfile] = [gname, gpath, gpath, gsize, gcont, gl90]
+    logger.info(("Found {} genomes in total").format(len(genomes)))
     return genomes
 
 
@@ -1155,6 +1143,7 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile):
     return contigs, sizes
 
 
+
 def logger_thread(q):
     """
     Queue listener used in a thread to handle the logs put to a QueueHandler
-- 
GitLab