diff --git a/PanACoTA/annotate_module/format_prodigal.py b/PanACoTA/annotate_module/format_prodigal.py index ea52a13f1b89d4daa1c25f7fc85c20beeca4aefc..d622aae3b34f37757cfcc7a058ba6a78df512f65 100644 --- a/PanACoTA/annotate_module/format_prodigal.py +++ b/PanACoTA/annotate_module/format_prodigal.py @@ -152,14 +152,16 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger): bool : True if conversion went well, False otherwise """ - # Variable which will contain the current gene sequence seq = "" # number of the current gene (first gene is 1, 2nd is 2 etc. each number is unique: do not # re-start at 1 for each new contig) locus_num = 1 - # contig number of the last gene. To check if we are now in a new contig (-> loc = b) or not + # contig name of the last gene. To check if we are now in a new contig (-> loc = b) or not + prev_cont_name = "" + # Previous ontig number: contig number to use in gembase format prev_cont_num = 0 + contig_num = 0 # Keep start, end, strand and informations (prodigal gives information on start_type, # gc_cont etc.) from the previous gene, before overwriting it with information # on the new one @@ -190,20 +192,21 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger): # Get information given for the new gene (by .ffn file from prodigal) (gname, start, end, strand, info) = lineffn.strip().split(">")[-1].split("#") # Get contig number from prodigal gene header: prodigal first part of header is: - # <original genome name>_<contig number>_<protein number> - contig_num = int(gname.strip().split("_")[-2]) - + # <original genome name contig name>_<protein number> + contig_name = "_".join(gname.strip().split("_")[:-1]) # If new contig: # - previous gene was the last of its contig -> prev_loc = "b" ; # - the current gene is the first of its contig (loc = "b") - if contig_num != prev_cont_num: + # - we must increment the contig number + if contig_name != prev_cont_name: + contig_num += 1 prev_loc = 'b' loc = 'b' # If not new contig. If prev_loc == 'b', previous gene is the first protein # of this contig. # Current gene will be inside the contig (except if new contig for the next gene, # meaning only 1 gene in the contig) - if contig_num == prev_cont_num and prev_loc == "b": + if contig_name == prev_cont_name and prev_loc == "b": loc = 'i' # If it is the first gene of the genome, we do not have any "previous gene" @@ -212,6 +215,7 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger): if prev_start == "": prev_start = start prev_end = end + prev_cont_name = contig_name prev_cont_num = contig_num prev_info = info # Convert strand 1/-1 to D/C @@ -239,6 +243,7 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger): locus_num += 1 seq = "" prev_cont_num = contig_num + prev_cont_name = contig_name prev_start = start prev_end = end prev_strand = strand diff --git a/PanACoTA/annotate_module/format_prokka.py b/PanACoTA/annotate_module/format_prokka.py index 76c1e9dbc3c882de1aded2c0da3a4e3d56735cdc..4faa26f2be9a50c5c5176c79a44e733fc59e3577 100644 --- a/PanACoTA/annotate_module/format_prokka.py +++ b/PanACoTA/annotate_module/format_prokka.py @@ -79,8 +79,6 @@ def format_one_genome(gpath, name, prok_path, lst_dir, prot_dir, gene_dir, res_prt_file = os.path.join(prot_dir, name + ".prt") res_rep_file = os.path.join(rep_dir, name + ".fna") - # Convert prokka tbl file to gembase .lst file format - tbl2lst(prokka_tbl_file, res_lst_file, name, logger) # Generate replicon file (same as input sequence but with gembase formatted headers). From # this file, get contig names, to be used to generate gff file @@ -97,6 +95,12 @@ def format_one_genome(gpath, name, prok_path, lst_dir, prot_dir, gene_dir, logger.error("Problems while generating Replicon file for {}".format(name)) return False + # Convert prokka tbl file to gembase .lst file format + tbl2lst(prokka_tbl_file, res_lst_file, contigs, name, logger) + + import sys + sys.exit(1) + # Create gff3 file for annotations ok_gff = generate_gff(gpath, prokka_gff_file, res_gff_file, res_lst_file, contigs, logger) if not ok_gff: @@ -442,7 +446,7 @@ def create_prt(faaseq, lstfile, prtseq, logger): return not problem -def tbl2lst(tblfile, lstfile, genome, logger): +def tbl2lst(tblfile, lstfile, contigs, genome, logger): """ Read prokka tbl file, and convert it to the lst file. @@ -473,6 +477,8 @@ def tbl2lst(tblfile, lstfile, genome, logger): name of prokka output tbl file to read lstfile : str name of lst file to generate + contigs : list + List of all contigs with their size ["contig1'\t'size1", "contig2'\t'size2" ...] genome : str genome name (gembase format) logger : logging.Logger @@ -489,9 +495,9 @@ def tbl2lst(tblfile, lstfile, genome, logger): cont_loc = "b" prev_cont_loc = "b" # Current contig number. Used to compare with new one, to know if protein is - # inside or at the boder of a contig - cont_num = "-1" - prev_cont_num = "-1" + # inside or at the border of a contig + cont_num = 0 + prev_cont_num = -1 # Information on current feature. At the beginning, everything empty, no information gene_name = "NA" product = "NA" @@ -512,7 +518,7 @@ def tbl2lst(tblfile, lstfile, genome, logger): # 2 elements: ">Feature" feature_name if line.startswith(">Feature"): contig = line.strip().split()[-1] - cont_num = contig.split("_")[-1] # -> 0010 + cont_num += 1 else: # Get line type, and retrieve info according to it # If it is not the line with start, end, type, there are only 2 elements @@ -531,16 +537,25 @@ def tbl2lst(tblfile, lstfile, genome, logger): inf2 = elems[1] if "db_xref" in elems[0]: db_xref = elems[1] - # 3 elements = start end and type of the feature + # new gene: + # 3 elements = start end and type of the feature else: - # Check contig: new or same as previous? + # Check contig: new or same as previous? # New contig - if int(cont_num) != int(prev_cont_num): - # Check if it is the contig just following the previous one, - # or if there are contigs without any feature between them. - if int(prev_cont_num) != -1 and int(cont_num) != int(prev_cont_num) + 1: + if cont_num != prev_cont_num: + # Check if + # - it is not the first gene of the genome + # - contig just following the previous one, or there are contigs + # without any feature between them? + # - this new contig is as expected (next contig of the list of + # contigs generated during Replicons file generation) + if prev_cont_num != -1 and cont_num != prev_cont_num + 1: logger.details("No feature found in contigs between contig {} and " "contig {}.".format(prev_cont_num, cont_num)) + cont_name_found = f"{genome}.{str(cont_num).zfill(4)}" + cont_name_exp = contigs[prev_cont_num - 1] + if cont_name_found != cont_name_exp: + # Previous loc was 'i' (because we were in the same contig). # But we now change contig, so this previous feature was the last # of its contigs -> prev_loc = "b" @@ -560,12 +575,15 @@ def tbl2lst(tblfile, lstfile, genome, logger): # If not first feature, write the previous feature to .lst file # (The first feature will be written once 2nd feature as been read) if start != -1 and end != -1: - crispr_num, _ = general.write_gene(feature_type, locus_num, gene_name, - product, crispr_num, prev_cont_loc, - genome, prev_cont_num, - ecnum, inf2, db_xref, - strand, start, end, lstf) - + crispr_num, lstline = general.write_gene(feature_type, locus_num, + gene_name, product, crispr_num, + prev_cont_loc, genome, + prev_cont_num, ecnum, inf2, + db_xref, strand, start, end, lstf) + print(cont_name) + print(f"contig name from rep file {}") + import sys + sys.exit(1) # Get new values for start, end, strand and feature type start, end, feature_type = elems # Get strain of gene @@ -575,7 +593,7 @@ def tbl2lst(tblfile, lstfile, genome, logger): else: strand = "D" # Initialize variables for next feature (except start, end, strand - # and feature type which are just calculated) + # and feature type which just calculated) prev_cont_num = cont_num prev_cont_loc = cont_loc locus_num = "NA" diff --git a/PanACoTA/annotate_module/general_format_functions.py b/PanACoTA/annotate_module/general_format_functions.py index a2f3972b523e68ff00d72e8a3af0aefd8b9fed16..1ec1caf1a987098585a3e43da9b9bb37d442be9c 100644 --- a/PanACoTA/annotate_module/general_format_functions.py +++ b/PanACoTA/annotate_module/general_format_functions.py @@ -30,7 +30,7 @@ import PanACoTA.annotate_module.format_prokka as fprokka import PanACoTA.annotate_module.format_prodigal as fprodigal -def format_genomes(genomes, results, res_path, annot_path, prodigal_only, threads=1, quiet=False): +def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, quiet=False): """ For all genomes which were annotated (by prokka or prodigal), reformat them in order to have, in 'res_path', the following folders: @@ -43,10 +43,8 @@ def format_genomes(genomes, results, res_path, annot_path, prodigal_only, thread Parameters ---------- - genomes : dict - {genome: [name, gpath, size, nbcont, l90]} - results : list - List of genome annotated by prokka/prodigal, that must be formatted + genomes_ok : dict + genomes to format (annotation was OK) -> {genome: [name, gpath, size, nbcont, l90]} res_path : str path to folder where the 4 directories must be created annot_path : str @@ -80,7 +78,7 @@ def format_genomes(genomes, results, res_path, annot_path, prodigal_only, thread os.makedirs(gff_dir, exist_ok=True) # If this function goes until here, it means that there is at least 1 genome to annotate - nbgen = len(genomes) + nbgen = len(genomes_ok) bar = None if not quiet: # Create progressbar @@ -94,9 +92,12 @@ def format_genomes(genomes, results, res_path, annot_path, prodigal_only, thread q = m.Queue() # if at least 1 genome ok, try to format it + # arguments for 'handle_genome' function: + # (genome, name, gpath, annot_path, lst_dir, prot_dir, gene_dir, rep_dir, + # gff_dir, results, prodigal_only, q) params = [(genome, name, gpath, annot_path, lst_dir, prot_dir, gene_dir, - rep_dir, gff_dir, results, prodigal_only, q) - for genome, (name, gpath, _, _, _) in genomes.items()] + rep_dir, gff_dir, prodigal_only, q) + for genome, (name, _, gpath, _, _, _) in genomes_ok.items()] # Create pool and launch parallel formating steps pool = multiprocessing.Pool(threads) @@ -146,7 +147,6 @@ def handle_genome(args): * gene_dit : path to 'Genes' folder * rep_dir : path to 'Replicons' folder * gff_dir : path to 'gff3' folder - * results : {genome_name: <True if genome was correctly annotated, False otherwise>} * prodigal_only : True if annotated by prodigal, False if annotated by prokka * q : multiprocessing.managers.AutoProxy[Queue] queue to put logs during subprocess @@ -158,7 +158,7 @@ def handle_genome(args): * genome name (used to get info from the pool.map_async) """ (genome, name, gpath, annot_path, lst_dir, prot_dir, - gene_dir, rep_dir, gff_dir, results, prodigal_only, q) = args + gene_dir, rep_dir, gff_dir, prodigal_only, q) = args # Define which formatting must be used, given the annotation software if prodigal_only: @@ -174,9 +174,7 @@ def handle_genome(args): logging.addLevelName(utils.detail_lvl(), "DETAIL") root.addHandler(qh) logger = logging.getLogger('format.handle_genome') - # Handle genome, if we have its informations - if genome not in results: - return "no_res", genome + # Handle genome ok_format = format_one_genome(gpath, name, annot_path, lst_dir, prot_dir, gene_dir, rep_dir, gff_dir, logger) return ok_format, genome @@ -226,8 +224,8 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc, Returns ------- - int : - Current crispr number + tuple : + Current crispr number, lstline """ # if last gene was a crispr if gtype == "repeat_region": diff --git a/PanACoTA/annotate_module/prokka_prodigal_functions.py b/PanACoTA/annotate_module/prokka_prodigal_functions.py index 21e1169b66aaad2732035085b71c3dbe1b7c5a1e..ac6ab95dc8759a1590a1d86082c4046fb618b9a5 100755 --- a/PanACoTA/annotate_module/prokka_prodigal_functions.py +++ b/PanACoTA/annotate_module/prokka_prodigal_functions.py @@ -185,7 +185,7 @@ def run_prokka(arguments): logging.addLevelName(utils.detail_lvl(), "DETAIL") root.addHandler(qh) logger = logging.getLogger('run_prokka') - logger.log(utils.detail_lvl(), "Start annotating {} {} with Prokka".format(name, gpath)) + logger.log(utils.detail_lvl(), f"Start annotating {name} from {gpath} with Prokka") # Define prokka directory and logfile, and check their existence prok_dir = os.path.join(prok_folder, os.path.basename(gpath) + "-prokkaRes") @@ -227,7 +227,7 @@ def run_prokka(arguments): # So, outdir does not exist -> run prokka cmd = ("prokka --outdir {} --cpus {} " "--prefix {} {}").format(prok_dir, threads, name, gpath) - error = "Error while trying to run prokka" + error = (f"Error while trying to run prokka on {name} from {gpath}") logger.details("Prokka command: " + cmd) prokf = open(prok_logfile, "w") ret = utils.run_cmd(cmd, error, eof=False, stderr=prokf) @@ -276,8 +276,8 @@ def run_prodigal(arguments): logging.addLevelName(utils.detail_lvl(), "DETAIL") root.addHandler(qh) logger = logging.getLogger('run_prodigal') - logger.log(utils.detail_lvl(), "Start annotating {} (from {} sequence) " - "with Prodigal".format(name, gpath)) + logger.log(utils.detail_lvl(), f"Start annotating {name} (from {gpath} sequence) " + "with Prodigal") # Define prodigal directory and logfile, and check their existence # By default, prodigal is in tmp_folder -> resdir/tmp_files/genome-prodigalRes diff --git a/PanACoTA/subcommands/annotate.py b/PanACoTA/subcommands/annotate.py index e07413f24d68cc9551630b4eeb3d9c0fb6668e9f..b32fdd0639f354ea481dd5b51d71246a47017360 100755 --- a/PanACoTA/subcommands/annotate.py +++ b/PanACoTA/subcommands/annotate.py @@ -212,6 +212,11 @@ def main(cmd, list_file, db_path, db_path2, res_dir, name, date, l90=100, nbcont # Check that needed softs are installed prokka = utils.check_installed("prokka") prodigal = utils.check_installed("prodigal") + if prodigal_only: + soft = "prodigal" + else: + soft = "prokka" + if not qc_only: # If user using prokka: check prokka is installed and in the path if not prodigal_only and not prokka: @@ -269,7 +274,7 @@ def main(cmd, list_file, db_path, db_path2, res_dir, name, date, l90=100, nbcont logger.info("Let's start!") # STEP 1. analyze genomes (nb contigs, L90, stretches of N...) - # If already info on genome (--info <file> option), skip this step + # If already info on genome ('--info <file>' option), skip this step # If no info on genomes, read them and get needed information if not from_info: # Read genome names. @@ -286,20 +291,20 @@ def main(cmd, list_file, db_path, db_path2, res_dir, name, date, l90=100, nbcont logger, quiet=quiet) # --info <filename> option given: read information (L90, nb contigs...) from this file. else: - # genomes = genome: [spegenus.date, orig_path, to_annotate_path, size, nbcont, l90] + # genomes = {genome: [spegenus.date, orig_path, to_annotate_path, size, nbcont, l90]} # orig_path is the path to the original sequence # and to_annotate_path the path to the sequence to annotate (once split etc.) # Here, both are the same, as we take given sequences as is. genomes = utils.read_genomes_info(from_info, name, date, db_path, db_path2) if not genomes: if db_path2: - logger.error(("We did not find any genome listed in {} in {} nor in {}. " + logger.error(("We did not find any genome listed in {} in {} folder nor in {}. " "Please check your list to give valid genome " "names.").format(from_info, db_path, db_path2)) else: logger.error(("We did not find any genome listed in {} in the folder {}. " - "Please check your list to give valid genome " - "names.").format(from_info, db_path)) + "Please check your list to give valid genome " + "names.").format(from_info, db_path)) sys.exit(-1) # STEP 2. keep only genomes with 'good' (according to user thresholds) L90 and nb_contigs @@ -311,16 +316,19 @@ def main(cmd, list_file, db_path, db_path2, res_dir, name, date, l90=100, nbcont if info[-2] <= nbcont and info[-1] <= l90} # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90 utils.write_genomes_info(genomes, list(kept_genomes.keys()), list_file, res_dir) - if cutn == 0 and prodigal_only: - logger.info("-> Folder with original sequence files and sequences to annotate: {}".format(db_path)) - logger.info("\t-> If original sequence or to annotate sequence not found in {}, " + # Info on folder containing original sequences + if not from_info: + logger.info("-> Original sequences folder (corresponding to 'orig-name' column in " + "'{}/LSTINFO-<input_file>.lst'): {} ".format(res_dir, db_path)) + logger.info("\t-> If original sequence not found in {}, " "look for it in {}, as it must be a concatenation of several input sequence " "files.".format(db_path, tmp_dir)) - else: - logger.info("-> Original sequences folder: {}".format(db_path)) - logger.info("\t-> If original sequence not found in {}, look for it in {}, as it must " - "be a concatenation of several input sequence files.".format(db_path, tmp_dir)) - logger.info("-> Folder with sequences to annotate: {}".format(tmp_dir)) + if prodigal_only and cutn == 0: + logger.info("-> Sequences used for annotation ('to_annotate' column) are the " + "same as the previous ones (original sequences).") + else: + logger.info("-> Folder with sequence file used for annotation ('to_annotate' column): " + "{}".format(tmp_dir)) # If only QC, stop here. if qc_only: @@ -336,13 +344,14 @@ def main(cmd, list_file, db_path, db_path2, res_dir, name, date, l90=100, nbcont # gsize, nbcont, L90]} # Write lstinfo file (list of genomes kept with info on L90 etc.) utils.write_lstinfo(list_file, kept_genomes, res_dir) - sys.exit(1) # STEP 4. Annotate all kept genomes results = pfunc.run_annotation_all(kept_genomes, threads, force, res_annot_dir, prodigal_only, small, quiet=quiet) - # List of genomes to format - results_ok = [genome for (genome, ok) in results.items() if ok] + # Information on genomes to format + # results_ok = {genome: [gembase_name, path_to_origfile, path_split_gembase, + # gsize, nbcont, L90]} + results_ok = {genome:info for (genome,info) in genomes.items() if results[genome]} # If no genome was ok, no need to format them. Just print that no genome was annotated, # end program. if not results_ok: @@ -361,8 +370,8 @@ def main(cmd, list_file, db_path, db_path2, res_dir, name, date, l90=100, nbcont # Initialize list of genomes skipped because something went wrong while formatting. skipped_format = [] # Generate database (folders Proteins, Genes, Replicons, LSTINFO) - skipped_format = ffunc.format_genomes(genomes, results_ok, res_dir, res_annot_dir, - prodigal_only, threads, quiet=quiet) + skipped_format = ffunc.format_genomes(results_ok, res_dir, res_annot_dir, + prodigal_only, threads, quiet=quiet) # At least one genome could not be formatted -> warn user if skipped_format: utils.write_warning_skipped(skipped_format, do_format=True, prodigal_only=prodigal_only, @@ -449,10 +458,10 @@ def build_parser(parser): "genome, you can give this information, to go directly to " "annotation and formatting steps. This file contains at " "least 4 columns, tab separated, with the following headers: " - "'orig_name', 'gsize', 'nb_conts', 'L90'. Any other column " + "'to_annotate', 'gsize', 'nb_conts', 'L90'. Any other column " "will be ignored.") optional.add_argument("-d2", dest="db_path2", - help="Path to 2nd folder containing all multifasta genome files. " + help="Path to 2nd folder containing multifasta genome files. " "Used if you run annotation of sequences found in 2 different " "folders. For example, if you ran this module, and now want to " "rerun it with different parameters, but using the already " diff --git a/PanACoTA/utils.py b/PanACoTA/utils.py index 102de493770f060ef0d97c384ec78c7dbed0b963..eccdf21489c91acf9c764a9e612507e4bd96d980 100755 --- a/PanACoTA/utils.py +++ b/PanACoTA/utils.py @@ -644,7 +644,7 @@ def read_genomes_info(list_file, name, date, dbpath, dbpath2): Parameters ---------- list_file : str - input file containing the list of genomes (filename, genome_name, size, L90, nb_contigs) + input file containing information on genomes (to_annotate, size, L90, nb_contigs) name : str Default species name date : str @@ -665,7 +665,7 @@ def read_genomes_info(list_file, name, date, dbpath, dbpath2): } """ logger = logging.getLogger("utils") - logger.info("Reading given information on your genomes") + logger.info(f"Reading given information on your genomes in {list_file}") genomes = {} spegenus = "{}.{}".format(name, date) column_order = {} # Put the number of column corresponding to each field