diff --git a/PanACoTA/subcommands/annotate.py b/PanACoTA/subcommands/annotate.py index 54af752b0d734bf0722a2aab6960ed2ba0f2fc76..e07413f24d68cc9551630b4eeb3d9c0fb6668e9f 100755 --- a/PanACoTA/subcommands/annotate.py +++ b/PanACoTA/subcommands/annotate.py @@ -115,14 +115,15 @@ def main_from_parse(arguments): """ cmd = "PanACoTA " + ' '.join(arguments.argv) - main(cmd, arguments.list_file, arguments.db_path, arguments.res_path, arguments.name, + main(cmd, arguments.list_file, arguments.db_path, arguments.db_path2, arguments.res_path, + arguments.name, arguments.date, arguments.l90, arguments.nbcont, arguments.cutn, arguments.threads, arguments.force, arguments.qc_only, arguments.from_info, arguments.tmpdir, arguments.annotdir, arguments.verbose, arguments.quiet, arguments.prodigal_only, arguments.small) -def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn=5, +def main(cmd, list_file, db_path, db_path2, res_dir, name, date, l90=100, nbcont=999, cutn=5, threads=1, force=False, qc_only=False, from_info=None, tmp_dir=None, res_annot_dir=None, verbose=0, quiet=False, prodigal_only=False, small=False): """ @@ -280,7 +281,7 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn "names.").format(list_file, db_path)) sys.exit(-1) # Get L90, nbcontig, size for all genomes, and cut at stretches of 'N' if asked - # -> genome: [spegenus.date, to_annotate_path, size, nbcont, l90] + # -> genome: [spegenus.date, orig_path, to_annotate_path, size, nbcont, l90] gfunc.analyse_all_genomes(genomes, db_path, tmp_dir, cutn, prodigal_only, logger, quiet=quiet) # --info <filename> option given: read information (L90, nb contigs...) from this file. @@ -289,7 +290,17 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn # orig_path is the path to the original sequence # and to_annotate_path the path to the sequence to annotate (once split etc.) # Here, both are the same, as we take given sequences as is. - genomes = utils.read_genomes_info(from_info, name, date, db_path, tmp_dir) + genomes = utils.read_genomes_info(from_info, name, date, db_path, db_path2) + if not genomes: + if db_path2: + logger.error(("We did not find any genome listed in {} in {} nor in {}. " + "Please check your list to give valid genome " + "names.").format(from_info, db_path, db_path2)) + else: + logger.error(("We did not find any genome listed in {} in the folder {}. " + "Please check your list to give valid genome " + "names.").format(from_info, db_path)) + sys.exit(-1) # STEP 2. keep only genomes with 'good' (according to user thresholds) L90 and nb_contigs # genomes = {genome: [spegenus.date, orig_seq, path_to_splitSequence, size, nbcont, l90]} @@ -298,12 +309,19 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn # Get list of genomes kept (according to L90 and nbcont thresholds) kept_genomes = {genome: info for genome, info in genomes.items() if info[-2] <= nbcont and info[-1] <= l90} - # Write discarded genomes to a file -> orig_name, gsize, nb_conts, L90 + # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90 utils.write_genomes_info(genomes, list(kept_genomes.keys()), list_file, res_dir) - logger.info("-> Original sequences folder: {}".format(db_path)) - logger.info("-> If original sequence not found in {}, look for it in {}, as it must " - "be a concatenation of several given sequence files.".format(db_path, tmp_dir)) - logger.info("-> Folder with sequences to annotate: {}".format(tmp_dir)) + if cutn == 0 and prodigal_only: + logger.info("-> Folder with original sequence files and sequences to annotate: {}".format(db_path)) + logger.info("\t-> If original sequence or to annotate sequence not found in {}, " + "look for it in {}, as it must be a concatenation of several input sequence " + "files.".format(db_path, tmp_dir)) + else: + logger.info("-> Original sequences folder: {}".format(db_path)) + logger.info("\t-> If original sequence not found in {}, look for it in {}, as it must " + "be a concatenation of several input sequence files.".format(db_path, tmp_dir)) + logger.info("-> Folder with sequences to annotate: {}".format(tmp_dir)) + # If only QC, stop here. if qc_only: # Write information on genomes that would be annotated with the current @@ -318,12 +336,11 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn # gsize, nbcont, L90]} # Write lstinfo file (list of genomes kept with info on L90 etc.) utils.write_lstinfo(list_file, kept_genomes, res_dir) + sys.exit(1) # STEP 4. Annotate all kept genomes results = pfunc.run_annotation_all(kept_genomes, threads, force, res_annot_dir, prodigal_only, small, quiet=quiet) - print(results) - sys.exit(1) # List of genomes to format results_ok = [genome for (genome, ok) in results.items() if ok] # If no genome was ok, no need to format them. Just print that no genome was annotated, @@ -364,22 +381,6 @@ def build_parser(parser): """ from PanACoTA import utils - from textwrap import dedent - - header = ''' - ___ _____ ___ _____ _____ -( _`\ ( _ )( _`\ (_ _)( _ ) -| |_) ) _ _ ___ | (_) || ( (_) _ | | | (_) | -| ,__/'/'_` )/' _ `\| _ || | _ /'_`\ | | | _ | -| | ( (_| || ( ) || | | || (_( )( (_) )| | | | | | -(_) `\__,_)(_) (_)(_) (_)(____/'`\___/'(_) (_) (_) - - - Large scale comparative genomics tools - - ------------------------------------------- - ''' - print(dedent(header)) def gen_name(param): if not utils.check_format(param): @@ -450,6 +451,14 @@ def build_parser(parser): "least 4 columns, tab separated, with the following headers: " "'orig_name', 'gsize', 'nb_conts', 'L90'. Any other column " "will be ignored.") + optional.add_argument("-d2", dest="db_path2", + help="Path to 2nd folder containing all multifasta genome files. " + "Used if you run annotation of sequences found in 2 different " + "folders. For example, if you ran this module, and now want to " + "rerun it with different parameters, but using the already " + "generated sequences to annotate, some of them are in db_path " + "whereas others are in the previous tmp_path (sequences cut, " + "prokka with small headers...") optional.add_argument("--prodigal", dest="prodigal_only", action="store_true", default=False, help="Add this option if you only want syntactical annotation, given " "by prodigal, and not functional annotation requiring prokka and " @@ -561,9 +570,10 @@ def check_args(parser, args): # Message if user is giving a file with already calculated information def nosplit_message(): - split = (" !! -> Your sequences will be used as is by PanACoTA. Be sure you already " - "split your sequences at each stretch of X 'N' if needed.\n") - trust = (" -> PanACoTA will use the values (L90, nbcont) given in your info file. " + split = (" !! Your sequences will be used as is by PanACoTA. Be sure you " + "already split your sequences at each stretch " + "of X 'N' if needed.\n") + trust = ("\t-> PanACoTA will use the values (L90, nbcont) given in your info file. " "It will ignore the genomes for which those values are incorrect. " "It will also ignore genomes with more than 999 contigs.") return split + trust @@ -594,7 +604,7 @@ def check_args(parser, args): "them. So, you cannot use both --cutN and --info") # WARNINGS - # If user wants to cur genomes, warn him to check that it is on purpose (because default is cut at each 5'N') + # If user wants to cut genomes, warn him to check that it is on purpose (because default is cut at each 5'N') if args.cutn != 0 and not args.from_info: message = (" !! Your genomes will be split when sequence contains at " "least {}'N' at a stretch.").format(args.cutn) @@ -604,9 +614,20 @@ def check_args(parser, args): print(colored(thresholds_message(args.l90, args.nbcont), "yellow")) if args.from_info: print(colored(nosplit_message(), "yellow")) + if not args.prodigal_only: + print(colored("\t-> Check that your genomes have unique headers, where headers " + "are the 20 first characters of first word of current " + "header", "yellow")) print() return args + # If db_path2 is used only with infofile + if args.db_path2 and not args.info: + parser.error("Using '-d2 <path>' means that you are running from an info file, whose " + "sequences come from 2 different folders (db_path and tmp_path if from a " + "previous run of this module). Use --info <info-file> or " + "remove '-d2 <path>' ") + if __name__ == '__main__': import argparse diff --git a/PanACoTA/utils.py b/PanACoTA/utils.py index 812870196314d881d533bb7407d49f7f07c61299..102de493770f060ef0d97c384ec78c7dbed0b963 100755 --- a/PanACoTA/utils.py +++ b/PanACoTA/utils.py @@ -631,13 +631,15 @@ def read_genomes(list_file, name, date, dbpath, tmp_path): return genomes -def read_genomes_info(list_file, name, date, dbpath, tmp_path): +def read_genomes_info(list_file, name, date, dbpath, dbpath2): """ Read a lstinfo file containing the list of genomes with information (L90, genome size etc.). 1 line per genome, 4 required columns (Others will just be ignored): to_annotate gsize nb_conts L90 - Check that the given genome file (to_annotate column) exists in dbpath. If not, check if it is in tmp_path. If in none of those 2 folders, put a warning message and ignore this file. + Check that the given genome file (to_annotate column) exists in dbpath or in dbpath2 + (files can be split into 2 different folders). If in none of those 2 folders, put a + warning message and ignore this file. Parameters ---------- @@ -648,11 +650,12 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path): date : str Default date dbpath : str - path to folder containing original genome files - tmp_path : str - path to folder which will contain the genome files to use before annotation, if\ - needed to change them from original file (for example, merging several contig files\ - in one file, split at each stretch of 5 'N', etc.). + path to folder containing genome files to annotate + dbpath2 : str + path to other folder which can contain the genome files to annotate. For example, + if it comes from a previous run of 'PanACoTA annotate' where sequences needed to be + modified to be ready for annotation (for example, merging several contig files in one + file, split at each stretch of 5 'N', etc.). Returns ------- @@ -675,8 +678,7 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path): with open(list_file, "r") as lff: for line in lff: line = line.strip() - # Skip header line. - # Just get column number corresponding to each field + # Header line: Just get column number corresponding to each field if "to_annotate" in line: column_headers = line.split("\t") column_order = {header:num for num,header in enumerate(column_headers)} @@ -684,42 +686,55 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path): if head in column_order] if len(found) != 4: logger.error("ERROR: Your information file does not have the required " - "columns: to_annotate, gsize nb_conts and L90 (in any order) " - "\n Ending program.") + "columns, tab separated: to_annotate, gsize, nb_conts and L90 " + "(in any order) \n Ending program.") sys.exit(1) continue # If no header found, error message and exit if not column_order: logger.error("ERROR: It seems that your info file does not have a header, or " - "this header does not have, at least, the required columns: " - "to_annotate, gsize nb_conts and L90 (in any order).\n " - "Ending program.") + "this header does not have, at least, the required columns" + "separated: to_annotate, gsize nb_conts and L90 (in any order).\n") sys.exit(1) # Get all information on the given genome - # genome, size, nb_cont, l90 = line.strip().split() - infos = line.strip().split() - # Get genome name with its path to db_dir - gname = infos[column_order["to_annotate"]] - gpath = os.path.join(dbpath, gname) + # line.strip().split() -> all given information. + # So, at least to_annotate, gsize, nbcont, L90 # Get numeric information try: + infos = line.strip().split() + # Get genome name with its path to db_dir + gname = infos[column_order["to_annotate"]] + gpath = os.path.join(dbpath, gname) gsize = int(infos[column_order["gsize"]]) gl90 = int(infos[column_order["L90"]]) gcont = int(infos[column_order["nb_conts"]]) - # If invalid values, warnng message and ignore genome + # If invalid values, warning message and ignore genome except ValueError: - logger.warning("For genome {}, at least one of your columns 'gsize', 'nb_conts' or 'L90' contains a non numeric character. This genome will be ignored.".format(gname)) + logger.warning("For genome {}, at least one of your columns 'gsize', 'nb_conts' " + "or 'L90' contains a non numeric character. This genome will be " + "ignored.".format(gname)) continue - # Check if genome file exists + # If no value for at least 1 field, warning message and ignore genome + except IndexError: + logger.error("ERROR: Check that all fields of {} are filled in each " + "line (can be 'NA')".format(list_file)) + sys.exit(-1) + + # Could we find genome file? + # Check if genome file exists in db_path. if not os.path.isfile(gpath): - # If it does not exist, look at it in tmp_file (can come from a concatenation - # of several files in db_path) - gpath = os.path.join(tmp_path, gname) - # If still not in tmp_dir, warning message and ignore genome - if not os.path.isfile(gpath): - logger.warning(("{} genome file does not exist in the given " - "database nor in your given tmp directory. " - "It will be ignored.".format(gpath))) + if dbpath2: + # If it does not exist, and there is a 2nd db_path, check if it exists there + gpath = os.path.join(dbpath2, gname) + # If still not in db_path2, warning message and ignore genome + if not os.path.isfile(gpath): + logger.warning("{} genome file does not exist in the given " + "database {} nor in the other directory ({}). " + "It will be ignored.".format(gname, dbpath, dbpath2)) + continue + else: + logger.warning("{} genome file does not exist in the given " + "database {}. It will be ignored.".format(gname, dbpath)) continue # cur genome information to save: # [spegenus.date, path_orig_seq, path_to_sequence_to_annotate, size, nbcont, l90]