diff --git a/PanACoTA/annotate_module/genome_seq_functions.py b/PanACoTA/annotate_module/genome_seq_functions.py index d9f1c2d93d0e67d03c15ad5b21008c8c6f8bcde3..286f5f9508fbde0f98d2789701214f29be4569f7 100755 --- a/PanACoTA/annotate_module/genome_seq_functions.py +++ b/PanACoTA/annotate_module/genome_seq_functions.py @@ -46,7 +46,7 @@ def analyse_all_genomes(genomes, dbpath, tmp_path, nbn, prodigal_only, logger, q Returns ------- dict - {genome: [spegenus.date, path, size, nbcont, l90]} + {genome: [spegenus.date, orig_name, path_to_seq_to_annotate, size, nbcont, l90]} """ cut = nbn > 0 @@ -113,7 +113,7 @@ def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, prodigal_only, l pattern on which contigs must be cut. ex: "NNNNN" genomes : dict {genome: [spegenus.date]} as input, and will be changed to\ - -> {genome: [spegenus.date, path, gsize, nbcont, L90]} + -> {genome: [spegenus.date, path, path_annotate, gsize, nbcont, L90]} prodigal_only : bool True if we annotate with prodigal, False if we annotate with prokka logger : logging.Logger @@ -143,7 +143,7 @@ def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, prodigal_only, l #### NEW CONTIG # Line corresponding to a new contig if line.startswith(">"): - # If not first contig, write info to output file (of needed) + # If not first contig, write info to output file (if needed) if cur_seq != "": num = format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, num, logger) @@ -179,11 +179,13 @@ def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, prodigal_only, l if not l90: logger.error("Problem with genome {}. Not possible to get L90".format(genome)) return False + # Everything ok for this genome -> complete its list of information in genomes dict + # [spegenus.date] -> [spegenus.date, gpath, gpath_to_annotate, gsize, nbcont, L90]} else: if grespath: - genomes[genome] += [grespath, gsize, nbcont, l90] + genomes[genome] += [gpath, grespath, gsize, nbcont, l90] else: - genomes[genome] += [gpath, gsize, nbcont, l90] + genomes[genome] += [gpath, gpath, gsize, nbcont, l90] # If we wrote a new sequence file, close it if grespath: gresf.close() @@ -333,7 +335,7 @@ def split_contig(pat, whole_seq, cur_contig_name, contig_sizes, gresf, num): if len(seq) == 0: continue new_contig_name = "{}_{}\n".format(cur_contig_name, num) - contig_sizes[cur_name] = len(seq) + contig_sizes[new_contig_name] = len(seq) gresf.write(new_contig_name) gresf.write(seq + "\n") num += 1 @@ -371,20 +373,32 @@ def rename_all_genomes(genomes): Parameters ---------- genomes : dict - {genome: [name, path, gsize, nbcont, L90]} as input, and will become\ - {genome: [gembase_name, path, gsize, nbcont, L90]} at the end + {genome: [name, path, path_to_seq, gsize, nbcont, L90]} as input, and will become\ + {genome: [gembase_name, path, path_to_seq, gsize, nbcont, L90]} at the end + + Return + ------ + change 1st field of genomes dict. name -> gembase_name (with strain number) """ logger.info("Renaming kept genomes according to their quality.") + # Keep previous genome name (ESCO.0109 -> ESCO) last_name = "" + # Keep last strain number last_strain = 0 # "SAEN.1015.{}".format(str(last_strain).zfill(5)) - for genome, [name, _, _, _, _] in sorted(genomes.items(), key=sort_genomes): + # Sort genomes by species, L90 and nb_contigs + for genome, [name, _, _, _, _, _] in sorted(genomes.items(), key=sort_genomes): + # first genome, or new strain name (ex: ESCO vs EXPL) + # -> keep this new name, and add 1 to next strain number if last_name != name.split(".")[0]: last_strain = 1 last_name = name.split(".")[0] + # same strain name + # -> write this new sequence, and go to next one (strain += 1) else: last_strain += 1 + # Write information to "genomes" dict. gembase_name = ".".join([name, str(last_strain).zfill(5)]) genomes[genome][0] = gembase_name @@ -417,7 +431,7 @@ def plot_distributions(genomes, res_path, listfile_base, l90, nbconts): Parameters ---------- genomes : dict - {genome: [name, path, size, nbcont, l90]} + {genome: [name, orig_path, to_annotate_path, size, nbcont, l90]} res_path : str path to put all output files listfile_base : str @@ -436,18 +450,11 @@ def plot_distributions(genomes, res_path, listfile_base, l90, nbconts): - dist1 : matplotlib figure of distribution of L90 values - dist2 : matplotlib figure of distribution of nb contigs values - """ - """ - genomes: - res_path: - listfile_base: - l90: max value of l90 allowed - nbconts: max value of nb contigs allowed """ logger.info("Generating distribution of L90 and #contigs graphs.") - l90_vals = [val for _, (_, _, _, _, val) in genomes.items()] + l90_vals = [val for _, (_, _, _, _, _, val) in genomes.items()] outl90 = os.path.join(res_path, "QC_L90-" + listfile_base + ".png") - nbcont_vals = [val for _, (_, _, _, val, _) in genomes.items()] + nbcont_vals = [val for _, (_, _, _, _, val, _) in genomes.items()] outnbcont = os.path.join(res_path, "QC_nb-contigs-" + listfile_base + ".png") dist1 = utils.plot_distr(l90_vals, l90, "L90 distribution for all genomes", "max L90 =") diff --git a/PanACoTA/utils.py b/PanACoTA/utils.py index 7e847c27b9485273d7a0a745d0a6cc77c5a888f5..0246054d300a3ca87aada5707d2943e4eb69b0f5 100755 --- a/PanACoTA/utils.py +++ b/PanACoTA/utils.py @@ -395,11 +395,12 @@ def write_discarded(genomes, kept_genomes, list_file, res_path, qc=False): Parameters ---------- genomes : dict - {genome: [gembase_start_name, seq_file, genome_size, nb_contigs, L90]} + {genome: [gembase_start_name, orig_seq_file, to_annotate_seq_file, + genome_size, nb_contigs, L90]} kept_genomes : list list of genomes kept list_file : str - input file containing the list of genomes + path to input file containing the list of genomes res_path : str folder where results must be saved qc : bool @@ -408,40 +409,46 @@ def write_discarded(genomes, kept_genomes, list_file, res_path, qc=False): """ logger = logging.getLogger("utils") + # number of genomes discarded nb_disc = len(genomes) - len(kept_genomes) + # Log number of genomes discarded. if not qc and nb_disc < 2: logger.info("{} genome was discarded.".format(nb_disc)) elif not qc: logger.info("{} genomes were discarded.".format(nb_disc)) + # Get input list file name (without path) _, name_lst = os.path.split(list_file) + + # if not QC, write discarded genomes to a file "discarded-[list_file].lst" if not qc: outdisc = os.path.join(res_path, "discarded-" + ".".join(name_lst.split(".")[:-1]) + ".lst") logger.info("Writing discarded genomes to {}".format(outdisc)) + # if QC, there is no 'discarded genome', just write information on all analyzed genomes else: outdisc = os.path.join(res_path, "info-genomes-" + ".".join(name_lst.split(".")[:-1]) + ".lst") - logger.info("Writting information on genomes in {}".format(outdisc)) + logger.info("Writing information on genomes in {}".format(outdisc)) with open(outdisc, "w") as outdf: outdf.write("\t".join(["orig_name", "gsize", "nb_conts", "L90"]) + "\n") for genome, values in genomes.items(): if genome in kept_genomes: continue - _, _, gsize, nbcont, l90 = [str(x) for x in values] + _, _, _, gsize, nbcont, l90 = [str(x) for x in values] outdf.write("\t".join([genome, gsize, nbcont, l90]) + "\n") def write_lstinfo(list_file, genomes, outdir): """ Write lstinfo file, with following columns: - gembase_name, orig_name, size, nbcontigs, l90 + gembase_name, orig_name, to_annotate_name, size, nbcontigs, l90 Parameters ---------- list_file : str input file containing the list of genomes genomes : dict - {genome: [gembase_start_name, seq_file, genome_size, nb_contigs, L90]} + {genome: [gembase_start_name, seq_file, seq_to_annotate, genome_size, nb_contigs, L90]} outdir : str folder where results must be saved @@ -450,10 +457,12 @@ def write_lstinfo(list_file, genomes, outdir): outlst = os.path.join(outdir, "LSTINFO-" + ".".join(name_lst.split(".")[:-1]) + ".lst") with open(outlst, "w") as outf: - outf.write("\t".join(["gembase_name", "orig_name", "gsize", "nb_conts", "L90"]) + "\n") + outf.write("\t".join(["gembase_name", "orig_name", "to_annotate", "gsize", + "nb_conts", "L90"]) + "\n") for genome, values in sorted(genomes.items(), key=sort_genomes): - gembase, _, gsize, nbcont, l90 = [str(x) for x in values] - outf.write("\t".join([gembase, genome, gsize, nbcont, l90]) + "\n") + gembase, _, to_annote, gsize, nbcont, l90 = [str(x) for x in values] + to_annote_file = os.path.basename(to_annote) + outf.write("\t".join([gembase, genome, to_annote_file, gsize, nbcont, l90]) + "\n") def sort_genomes(x): @@ -571,6 +580,7 @@ def read_genomes(list_file, name, date, dbpath, tmp_path): genomes_inf = genomes_inf.strip() cur_name, cur_date = read_info(name_inf, name, date, genomes_inf) # If no separator '::', no information on name and date of genome: use default ones + # Line only contains genome name (filename in given db_path) else: genomes_inf = line.strip() cur_name = name @@ -640,7 +650,9 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path): Returns ------- dict - genomes = {genome: [spegenus.date, path_to_splitSequence, size, nbcont, l90]} + genomes = {genome: + [spegenus.date, path_orig_seq, path_to_splitSequence, size, nbcont, l90] + } """ logger = logging.getLogger("utils") logger.info("Reading given information on your genomes") @@ -690,7 +702,9 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path): logger.warning(("{} genome file does not exist in the given database. " "It will be ignored.".format(gpath))) continue - genomes[gname] = [spegenus, gpath, gsize, gcont, gl90] + # cur genome information to save: + # [spegenus.date, path_orig_seq, path_to_splitSequence, size, nbcont, l90] + genomes[gname] = [spegenus, gpath, gpath, gsize, gcont, gl90] return genomes