Skip to content
Snippets Groups Projects
Commit 8f5018d6 authored by Amandine  PERRIN's avatar Amandine PERRIN
Browse files

Better read of --info file

- possible to give 2 different db_path (db_path and tmp_path for example)
- check that file exists in 1 of those folders
- get information, and warn if header not consistent
- warn if empty fields in --info file
parent 7eff7c24
No related branches found
No related tags found
No related merge requests found
...@@ -115,14 +115,15 @@ def main_from_parse(arguments): ...@@ -115,14 +115,15 @@ def main_from_parse(arguments):
""" """
cmd = "PanACoTA " + ' '.join(arguments.argv) cmd = "PanACoTA " + ' '.join(arguments.argv)
main(cmd, arguments.list_file, arguments.db_path, arguments.res_path, arguments.name, main(cmd, arguments.list_file, arguments.db_path, arguments.db_path2, arguments.res_path,
arguments.name,
arguments.date, arguments.l90, arguments.nbcont, arguments.cutn, arguments.threads, arguments.date, arguments.l90, arguments.nbcont, arguments.cutn, arguments.threads,
arguments.force, arguments.qc_only, arguments.from_info, arguments.tmpdir, arguments.force, arguments.qc_only, arguments.from_info, arguments.tmpdir,
arguments.annotdir, arguments.verbose, arguments.quiet, arguments.prodigal_only, arguments.annotdir, arguments.verbose, arguments.quiet, arguments.prodigal_only,
arguments.small) arguments.small)
def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn=5, def main(cmd, list_file, db_path, db_path2, res_dir, name, date, l90=100, nbcont=999, cutn=5,
threads=1, force=False, qc_only=False, from_info=None, tmp_dir=None, res_annot_dir=None, threads=1, force=False, qc_only=False, from_info=None, tmp_dir=None, res_annot_dir=None,
verbose=0, quiet=False, prodigal_only=False, small=False): verbose=0, quiet=False, prodigal_only=False, small=False):
""" """
...@@ -280,7 +281,7 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn ...@@ -280,7 +281,7 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn
"names.").format(list_file, db_path)) "names.").format(list_file, db_path))
sys.exit(-1) sys.exit(-1)
# Get L90, nbcontig, size for all genomes, and cut at stretches of 'N' if asked # Get L90, nbcontig, size for all genomes, and cut at stretches of 'N' if asked
# -> genome: [spegenus.date, to_annotate_path, size, nbcont, l90] # -> genome: [spegenus.date, orig_path, to_annotate_path, size, nbcont, l90]
gfunc.analyse_all_genomes(genomes, db_path, tmp_dir, cutn, prodigal_only, gfunc.analyse_all_genomes(genomes, db_path, tmp_dir, cutn, prodigal_only,
logger, quiet=quiet) logger, quiet=quiet)
# --info <filename> option given: read information (L90, nb contigs...) from this file. # --info <filename> option given: read information (L90, nb contigs...) from this file.
...@@ -289,7 +290,17 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn ...@@ -289,7 +290,17 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn
# orig_path is the path to the original sequence # orig_path is the path to the original sequence
# and to_annotate_path the path to the sequence to annotate (once split etc.) # and to_annotate_path the path to the sequence to annotate (once split etc.)
# Here, both are the same, as we take given sequences as is. # Here, both are the same, as we take given sequences as is.
genomes = utils.read_genomes_info(from_info, name, date, db_path, tmp_dir) genomes = utils.read_genomes_info(from_info, name, date, db_path, db_path2)
if not genomes:
if db_path2:
logger.error(("We did not find any genome listed in {} in {} nor in {}. "
"Please check your list to give valid genome "
"names.").format(from_info, db_path, db_path2))
else:
logger.error(("We did not find any genome listed in {} in the folder {}. "
"Please check your list to give valid genome "
"names.").format(from_info, db_path))
sys.exit(-1)
# STEP 2. keep only genomes with 'good' (according to user thresholds) L90 and nb_contigs # STEP 2. keep only genomes with 'good' (according to user thresholds) L90 and nb_contigs
# genomes = {genome: [spegenus.date, orig_seq, path_to_splitSequence, size, nbcont, l90]} # genomes = {genome: [spegenus.date, orig_seq, path_to_splitSequence, size, nbcont, l90]}
...@@ -298,12 +309,19 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn ...@@ -298,12 +309,19 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn
# Get list of genomes kept (according to L90 and nbcont thresholds) # Get list of genomes kept (according to L90 and nbcont thresholds)
kept_genomes = {genome: info for genome, info in genomes.items() kept_genomes = {genome: info for genome, info in genomes.items()
if info[-2] <= nbcont and info[-1] <= l90} if info[-2] <= nbcont and info[-1] <= l90}
# Write discarded genomes to a file -> orig_name, gsize, nb_conts, L90 # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90
utils.write_genomes_info(genomes, list(kept_genomes.keys()), list_file, res_dir) utils.write_genomes_info(genomes, list(kept_genomes.keys()), list_file, res_dir)
logger.info("-> Original sequences folder: {}".format(db_path)) if cutn == 0 and prodigal_only:
logger.info("-> If original sequence not found in {}, look for it in {}, as it must " logger.info("-> Folder with original sequence files and sequences to annotate: {}".format(db_path))
"be a concatenation of several given sequence files.".format(db_path, tmp_dir)) logger.info("\t-> If original sequence or to annotate sequence not found in {}, "
logger.info("-> Folder with sequences to annotate: {}".format(tmp_dir)) "look for it in {}, as it must be a concatenation of several input sequence "
"files.".format(db_path, tmp_dir))
else:
logger.info("-> Original sequences folder: {}".format(db_path))
logger.info("\t-> If original sequence not found in {}, look for it in {}, as it must "
"be a concatenation of several input sequence files.".format(db_path, tmp_dir))
logger.info("-> Folder with sequences to annotate: {}".format(tmp_dir))
# If only QC, stop here. # If only QC, stop here.
if qc_only: if qc_only:
# Write information on genomes that would be annotated with the current # Write information on genomes that would be annotated with the current
...@@ -318,12 +336,11 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn ...@@ -318,12 +336,11 @@ def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn
# gsize, nbcont, L90]} # gsize, nbcont, L90]}
# Write lstinfo file (list of genomes kept with info on L90 etc.) # Write lstinfo file (list of genomes kept with info on L90 etc.)
utils.write_lstinfo(list_file, kept_genomes, res_dir) utils.write_lstinfo(list_file, kept_genomes, res_dir)
sys.exit(1)
# STEP 4. Annotate all kept genomes # STEP 4. Annotate all kept genomes
results = pfunc.run_annotation_all(kept_genomes, threads, force, res_annot_dir, results = pfunc.run_annotation_all(kept_genomes, threads, force, res_annot_dir,
prodigal_only, small, quiet=quiet) prodigal_only, small, quiet=quiet)
print(results)
sys.exit(1)
# List of genomes to format # List of genomes to format
results_ok = [genome for (genome, ok) in results.items() if ok] results_ok = [genome for (genome, ok) in results.items() if ok]
# If no genome was ok, no need to format them. Just print that no genome was annotated, # If no genome was ok, no need to format them. Just print that no genome was annotated,
...@@ -364,22 +381,6 @@ def build_parser(parser): ...@@ -364,22 +381,6 @@ def build_parser(parser):
""" """
from PanACoTA import utils from PanACoTA import utils
from textwrap import dedent
header = '''
___ _____ ___ _____ _____
( _`\ ( _ )( _`\ (_ _)( _ )
| |_) ) _ _ ___ | (_) || ( (_) _ | | | (_) |
| ,__/'/'_` )/' _ `\| _ || | _ /'_`\ | | | _ |
| | ( (_| || ( ) || | | || (_( )( (_) )| | | | | |
(_) `\__,_)(_) (_)(_) (_)(____/'`\___/'(_) (_) (_)
Large scale comparative genomics tools
-------------------------------------------
'''
print(dedent(header))
def gen_name(param): def gen_name(param):
if not utils.check_format(param): if not utils.check_format(param):
...@@ -450,6 +451,14 @@ def build_parser(parser): ...@@ -450,6 +451,14 @@ def build_parser(parser):
"least 4 columns, tab separated, with the following headers: " "least 4 columns, tab separated, with the following headers: "
"'orig_name', 'gsize', 'nb_conts', 'L90'. Any other column " "'orig_name', 'gsize', 'nb_conts', 'L90'. Any other column "
"will be ignored.") "will be ignored.")
optional.add_argument("-d2", dest="db_path2",
help="Path to 2nd folder containing all multifasta genome files. "
"Used if you run annotation of sequences found in 2 different "
"folders. For example, if you ran this module, and now want to "
"rerun it with different parameters, but using the already "
"generated sequences to annotate, some of them are in db_path "
"whereas others are in the previous tmp_path (sequences cut, "
"prokka with small headers...")
optional.add_argument("--prodigal", dest="prodigal_only", action="store_true", default=False, optional.add_argument("--prodigal", dest="prodigal_only", action="store_true", default=False,
help="Add this option if you only want syntactical annotation, given " help="Add this option if you only want syntactical annotation, given "
"by prodigal, and not functional annotation requiring prokka and " "by prodigal, and not functional annotation requiring prokka and "
...@@ -561,9 +570,10 @@ def check_args(parser, args): ...@@ -561,9 +570,10 @@ def check_args(parser, args):
# Message if user is giving a file with already calculated information # Message if user is giving a file with already calculated information
def nosplit_message(): def nosplit_message():
split = (" !! -> Your sequences will be used as is by PanACoTA. Be sure you already " split = (" !! Your sequences will be used as is by PanACoTA. Be sure you "
"split your sequences at each stretch of X 'N' if needed.\n") "already split your sequences at each stretch "
trust = (" -> PanACoTA will use the values (L90, nbcont) given in your info file. " "of X 'N' if needed.\n")
trust = ("\t-> PanACoTA will use the values (L90, nbcont) given in your info file. "
"It will ignore the genomes for which those values are incorrect. " "It will ignore the genomes for which those values are incorrect. "
"It will also ignore genomes with more than 999 contigs.") "It will also ignore genomes with more than 999 contigs.")
return split + trust return split + trust
...@@ -594,7 +604,7 @@ def check_args(parser, args): ...@@ -594,7 +604,7 @@ def check_args(parser, args):
"them. So, you cannot use both --cutN and --info") "them. So, you cannot use both --cutN and --info")
# WARNINGS # WARNINGS
# If user wants to cur genomes, warn him to check that it is on purpose (because default is cut at each 5'N') # If user wants to cut genomes, warn him to check that it is on purpose (because default is cut at each 5'N')
if args.cutn != 0 and not args.from_info: if args.cutn != 0 and not args.from_info:
message = (" !! Your genomes will be split when sequence contains at " message = (" !! Your genomes will be split when sequence contains at "
"least {}'N' at a stretch.").format(args.cutn) "least {}'N' at a stretch.").format(args.cutn)
...@@ -604,9 +614,20 @@ def check_args(parser, args): ...@@ -604,9 +614,20 @@ def check_args(parser, args):
print(colored(thresholds_message(args.l90, args.nbcont), "yellow")) print(colored(thresholds_message(args.l90, args.nbcont), "yellow"))
if args.from_info: if args.from_info:
print(colored(nosplit_message(), "yellow")) print(colored(nosplit_message(), "yellow"))
if not args.prodigal_only:
print(colored("\t-> Check that your genomes have unique headers, where headers "
"are the 20 first characters of first word of current "
"header", "yellow"))
print() print()
return args return args
# If db_path2 is used only with infofile
if args.db_path2 and not args.info:
parser.error("Using '-d2 <path>' means that you are running from an info file, whose "
"sequences come from 2 different folders (db_path and tmp_path if from a "
"previous run of this module). Use --info <info-file> or "
"remove '-d2 <path>' ")
if __name__ == '__main__': if __name__ == '__main__':
import argparse import argparse
......
...@@ -631,13 +631,15 @@ def read_genomes(list_file, name, date, dbpath, tmp_path): ...@@ -631,13 +631,15 @@ def read_genomes(list_file, name, date, dbpath, tmp_path):
return genomes return genomes
def read_genomes_info(list_file, name, date, dbpath, tmp_path): def read_genomes_info(list_file, name, date, dbpath, dbpath2):
""" """
Read a lstinfo file containing the list of genomes with information (L90, genome size etc.). Read a lstinfo file containing the list of genomes with information (L90, genome size etc.).
1 line per genome, 4 required columns (Others will just be ignored): 1 line per genome, 4 required columns (Others will just be ignored):
to_annotate gsize nb_conts L90 to_annotate gsize nb_conts L90
Check that the given genome file (to_annotate column) exists in dbpath. If not, check if it is in tmp_path. If in none of those 2 folders, put a warning message and ignore this file. Check that the given genome file (to_annotate column) exists in dbpath or in dbpath2
(files can be split into 2 different folders). If in none of those 2 folders, put a
warning message and ignore this file.
Parameters Parameters
---------- ----------
...@@ -648,11 +650,12 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path): ...@@ -648,11 +650,12 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path):
date : str date : str
Default date Default date
dbpath : str dbpath : str
path to folder containing original genome files path to folder containing genome files to annotate
tmp_path : str dbpath2 : str
path to folder which will contain the genome files to use before annotation, if\ path to other folder which can contain the genome files to annotate. For example,
needed to change them from original file (for example, merging several contig files\ if it comes from a previous run of 'PanACoTA annotate' where sequences needed to be
in one file, split at each stretch of 5 'N', etc.). modified to be ready for annotation (for example, merging several contig files in one
file, split at each stretch of 5 'N', etc.).
Returns Returns
------- -------
...@@ -675,8 +678,7 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path): ...@@ -675,8 +678,7 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path):
with open(list_file, "r") as lff: with open(list_file, "r") as lff:
for line in lff: for line in lff:
line = line.strip() line = line.strip()
# Skip header line. # Header line: Just get column number corresponding to each field
# Just get column number corresponding to each field
if "to_annotate" in line: if "to_annotate" in line:
column_headers = line.split("\t") column_headers = line.split("\t")
column_order = {header:num for num,header in enumerate(column_headers)} column_order = {header:num for num,header in enumerate(column_headers)}
...@@ -684,42 +686,55 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path): ...@@ -684,42 +686,55 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path):
if head in column_order] if head in column_order]
if len(found) != 4: if len(found) != 4:
logger.error("ERROR: Your information file does not have the required " logger.error("ERROR: Your information file does not have the required "
"columns: to_annotate, gsize nb_conts and L90 (in any order) " "columns, tab separated: to_annotate, gsize, nb_conts and L90 "
"\n Ending program.") "(in any order) \n Ending program.")
sys.exit(1) sys.exit(1)
continue continue
# If no header found, error message and exit # If no header found, error message and exit
if not column_order: if not column_order:
logger.error("ERROR: It seems that your info file does not have a header, or " logger.error("ERROR: It seems that your info file does not have a header, or "
"this header does not have, at least, the required columns: " "this header does not have, at least, the required columns"
"to_annotate, gsize nb_conts and L90 (in any order).\n " "separated: to_annotate, gsize nb_conts and L90 (in any order).\n")
"Ending program.")
sys.exit(1) sys.exit(1)
# Get all information on the given genome # Get all information on the given genome
# genome, size, nb_cont, l90 = line.strip().split() # line.strip().split() -> all given information.
infos = line.strip().split() # So, at least to_annotate, gsize, nbcont, L90
# Get genome name with its path to db_dir
gname = infos[column_order["to_annotate"]]
gpath = os.path.join(dbpath, gname)
# Get numeric information # Get numeric information
try: try:
infos = line.strip().split()
# Get genome name with its path to db_dir
gname = infos[column_order["to_annotate"]]
gpath = os.path.join(dbpath, gname)
gsize = int(infos[column_order["gsize"]]) gsize = int(infos[column_order["gsize"]])
gl90 = int(infos[column_order["L90"]]) gl90 = int(infos[column_order["L90"]])
gcont = int(infos[column_order["nb_conts"]]) gcont = int(infos[column_order["nb_conts"]])
# If invalid values, warnng message and ignore genome # If invalid values, warning message and ignore genome
except ValueError: except ValueError:
logger.warning("For genome {}, at least one of your columns 'gsize', 'nb_conts' or 'L90' contains a non numeric character. This genome will be ignored.".format(gname)) logger.warning("For genome {}, at least one of your columns 'gsize', 'nb_conts' "
"or 'L90' contains a non numeric character. This genome will be "
"ignored.".format(gname))
continue continue
# Check if genome file exists # If no value for at least 1 field, warning message and ignore genome
except IndexError:
logger.error("ERROR: Check that all fields of {} are filled in each "
"line (can be 'NA')".format(list_file))
sys.exit(-1)
# Could we find genome file?
# Check if genome file exists in db_path.
if not os.path.isfile(gpath): if not os.path.isfile(gpath):
# If it does not exist, look at it in tmp_file (can come from a concatenation if dbpath2:
# of several files in db_path) # If it does not exist, and there is a 2nd db_path, check if it exists there
gpath = os.path.join(tmp_path, gname) gpath = os.path.join(dbpath2, gname)
# If still not in tmp_dir, warning message and ignore genome # If still not in db_path2, warning message and ignore genome
if not os.path.isfile(gpath): if not os.path.isfile(gpath):
logger.warning(("{} genome file does not exist in the given " logger.warning("{} genome file does not exist in the given "
"database nor in your given tmp directory. " "database {} nor in the other directory ({}). "
"It will be ignored.".format(gpath))) "It will be ignored.".format(gname, dbpath, dbpath2))
continue
else:
logger.warning("{} genome file does not exist in the given "
"database {}. It will be ignored.".format(gname, dbpath))
continue continue
# cur genome information to save: # cur genome information to save:
# [spegenus.date, path_orig_seq, path_to_sequence_to_annotate, size, nbcont, l90] # [spegenus.date, path_orig_seq, path_to_sequence_to_annotate, size, nbcont, l90]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment