diff --git a/PanACoTA/utils.py b/PanACoTA/utils.py index 0246054d300a3ca87aada5707d2943e4eb69b0f5..812870196314d881d533bb7407d49f7f07c61299 100755 --- a/PanACoTA/utils.py +++ b/PanACoTA/utils.py @@ -387,10 +387,15 @@ def write_warning_skipped(skipped, do_format=False, prodigal_only=False, logfile "formatted.\n{1}").format(soft, list_to_write)) -def write_discarded(genomes, kept_genomes, list_file, res_path, qc=False): +def write_genomes_info(genomes, kept_genomes, list_file, res_path, qc=False): """ - Write the list of genomes discarded to a file, so that users can keep a trace of them, - with their information (nb contigs, L90 etc.) + Write the list of genomes discarded to a file (qc=False), so that users can + keep a trace of them, with their information (nb contigs, L90 etc.) + + If qc=True, we stop after QC. + -> Write the list of genomes that would be kept for annotation with all + their information (L90, size, #contig) + Parameters ---------- @@ -404,9 +409,11 @@ def write_discarded(genomes, kept_genomes, list_file, res_path, qc=False): res_path : str folder where results must be saved qc : bool - if it is the file written after qc only (True), call it info-genomes-<list_file>.txt\ - otherwise (False), call it discarded-<list_file>.txt - + * True: called only if QC only. Name this file info-genomes-<list_file>.txt to put + information on genomes that would be annotated if not QC only + * otherwise (False), called in any case. Name this file discarded-<list_file>.txt + and write all discarded genomes, whether sequences kept are next annotated or not + => columns: orig_name, to_annotate, gsize, nb_conts, L90 """ logger = logging.getLogger("utils") # number of genomes discarded @@ -430,12 +437,13 @@ def write_discarded(genomes, kept_genomes, list_file, res_path, qc=False): "info-genomes-" + ".".join(name_lst.split(".")[:-1]) + ".lst") logger.info("Writing information on genomes in {}".format(outdisc)) with open(outdisc, "w") as outdf: - outdf.write("\t".join(["orig_name", "gsize", "nb_conts", "L90"]) + "\n") + outdf.write("\t".join(["orig_name", "to_annotate", "gsize", "nb_conts", "L90"]) + "\n") for genome, values in genomes.items(): if genome in kept_genomes: continue - _, _, _, gsize, nbcont, l90 = [str(x) for x in values] - outdf.write("\t".join([genome, gsize, nbcont, l90]) + "\n") + _, _, to_annotate, gsize, nbcont, l90 = [str(x) for x in values] + to_annotate_file = os.path.basename(to_annotate) + outdf.write("\t".join([genome, to_annotate_file, gsize, nbcont, l90]) + "\n") def write_lstinfo(list_file, genomes, outdir): @@ -626,11 +634,10 @@ def read_genomes(list_file, name, date, dbpath, tmp_path): def read_genomes_info(list_file, name, date, dbpath, tmp_path): """ Read a lstinfo file containing the list of genomes with information (L90, genome size etc.). - 1 line per genome, 4 or 5 columns: - [gembase_name] orig_name gsize nb_conts L90 + 1 line per genome, 4 required columns (Others will just be ignored): + to_annotate gsize nb_conts L90 - Check that the given genome file exists in dbpath. Otherwise, put an error message, - and ignore this file. + Check that the given genome file (to_annotate column) exists in dbpath. If not, check if it is in tmp_path. If in none of those 2 folders, put a warning message and ignore this file. Parameters ---------- @@ -670,40 +677,52 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path): line = line.strip() # Skip header line. # Just get column number corresponding to each field - if "orig_name" in line: + if "to_annotate" in line: column_headers = line.split("\t") column_order = {header:num for num,header in enumerate(column_headers)} - found = [head for head in ["orig_name", "gsize", "nb_conts", "L90"] + found = [head for head in ["to_annotate", "gsize", "nb_conts", "L90"] if head in column_order] if len(found) != 4: logger.error("ERROR: Your information file does not have the required " - "columns: orig_name, gsize nb_conts and L90 (in any order) " + "columns: to_annotate, gsize nb_conts and L90 (in any order) " "\n Ending program.") sys.exit(1) continue + # If no header found, error message and exit if not column_order: logger.error("ERROR: It seems that your info file does not have a header, or " - "this header does not have the required columns: orig_name, gsize " - " nb_conts and L90 (in any order).\n Ending program.") + "this header does not have, at least, the required columns: " + "to_annotate, gsize nb_conts and L90 (in any order).\n " + "Ending program.") sys.exit(1) # Get all information on the given genome # genome, size, nb_cont, l90 = line.strip().split() infos = line.strip().split() - gname = infos[column_order["orig_name"]] + # Get genome name with its path to db_dir + gname = infos[column_order["to_annotate"]] gpath = os.path.join(dbpath, gname) + # Get numeric information try: gsize = int(infos[column_order["gsize"]]) gl90 = int(infos[column_order["L90"]]) gcont = int(infos[column_order["nb_conts"]]) + # If invalid values, warnng message and ignore genome except ValueError: logger.warning("For genome {}, at least one of your columns 'gsize', 'nb_conts' or 'L90' contains a non numeric character. This genome will be ignored.".format(gname)) continue + # Check if genome file exists if not os.path.isfile(gpath): - logger.warning(("{} genome file does not exist in the given database. " - "It will be ignored.".format(gpath))) - continue + # If it does not exist, look at it in tmp_file (can come from a concatenation + # of several files in db_path) + gpath = os.path.join(tmp_path, gname) + # If still not in tmp_dir, warning message and ignore genome + if not os.path.isfile(gpath): + logger.warning(("{} genome file does not exist in the given " + "database nor in your given tmp directory. " + "It will be ignored.".format(gpath))) + continue # cur genome information to save: - # [spegenus.date, path_orig_seq, path_to_splitSequence, size, nbcont, l90] + # [spegenus.date, path_orig_seq, path_to_sequence_to_annotate, size, nbcont, l90] genomes[gname] = [spegenus, gpath, gpath, gsize, gcont, gl90] return genomes