Skip to content
Snippets Groups Projects
Commit 5b411ee1 authored by Amandine  PERRIN's avatar Amandine PERRIN
Browse files

Add column to lstinfo

Before, only gembase name and original name. Now, name of sequence annotated
parent 66d300bc
No related branches found
No related tags found
No related merge requests found
......@@ -46,7 +46,7 @@ def analyse_all_genomes(genomes, dbpath, tmp_path, nbn, prodigal_only, logger, q
Returns
-------
dict
{genome: [spegenus.date, path, size, nbcont, l90]}
{genome: [spegenus.date, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}
"""
cut = nbn > 0
......@@ -113,7 +113,7 @@ def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, prodigal_only, l
pattern on which contigs must be cut. ex: "NNNNN"
genomes : dict
{genome: [spegenus.date]} as input, and will be changed to\
-> {genome: [spegenus.date, path, gsize, nbcont, L90]}
-> {genome: [spegenus.date, path, path_annotate, gsize, nbcont, L90]}
prodigal_only : bool
True if we annotate with prodigal, False if we annotate with prokka
logger : logging.Logger
......@@ -143,7 +143,7 @@ def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, prodigal_only, l
#### NEW CONTIG
# Line corresponding to a new contig
if line.startswith(">"):
# If not first contig, write info to output file (of needed)
# If not first contig, write info to output file (if needed)
if cur_seq != "":
num = format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes,
gresf, num, logger)
......@@ -179,11 +179,13 @@ def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, prodigal_only, l
if not l90:
logger.error("Problem with genome {}. Not possible to get L90".format(genome))
return False
# Everything ok for this genome -> complete its list of information in genomes dict
# [spegenus.date] -> [spegenus.date, gpath, gpath_to_annotate, gsize, nbcont, L90]}
else:
if grespath:
genomes[genome] += [grespath, gsize, nbcont, l90]
genomes[genome] += [gpath, grespath, gsize, nbcont, l90]
else:
genomes[genome] += [gpath, gsize, nbcont, l90]
genomes[genome] += [gpath, gpath, gsize, nbcont, l90]
# If we wrote a new sequence file, close it
if grespath:
gresf.close()
......@@ -333,7 +335,7 @@ def split_contig(pat, whole_seq, cur_contig_name, contig_sizes, gresf, num):
if len(seq) == 0:
continue
new_contig_name = "{}_{}\n".format(cur_contig_name, num)
contig_sizes[cur_name] = len(seq)
contig_sizes[new_contig_name] = len(seq)
gresf.write(new_contig_name)
gresf.write(seq + "\n")
num += 1
......@@ -371,20 +373,32 @@ def rename_all_genomes(genomes):
Parameters
----------
genomes : dict
{genome: [name, path, gsize, nbcont, L90]} as input, and will become\
{genome: [gembase_name, path, gsize, nbcont, L90]} at the end
{genome: [name, path, path_to_seq, gsize, nbcont, L90]} as input, and will become\
{genome: [gembase_name, path, path_to_seq, gsize, nbcont, L90]} at the end
Return
------
change 1st field of genomes dict. name -> gembase_name (with strain number)
"""
logger.info("Renaming kept genomes according to their quality.")
# Keep previous genome name (ESCO.0109 -> ESCO)
last_name = ""
# Keep last strain number
last_strain = 0
# "SAEN.1015.{}".format(str(last_strain).zfill(5))
for genome, [name, _, _, _, _] in sorted(genomes.items(), key=sort_genomes):
# Sort genomes by species, L90 and nb_contigs
for genome, [name, _, _, _, _, _] in sorted(genomes.items(), key=sort_genomes):
# first genome, or new strain name (ex: ESCO vs EXPL)
# -> keep this new name, and add 1 to next strain number
if last_name != name.split(".")[0]:
last_strain = 1
last_name = name.split(".")[0]
# same strain name
# -> write this new sequence, and go to next one (strain += 1)
else:
last_strain += 1
# Write information to "genomes" dict.
gembase_name = ".".join([name, str(last_strain).zfill(5)])
genomes[genome][0] = gembase_name
......@@ -417,7 +431,7 @@ def plot_distributions(genomes, res_path, listfile_base, l90, nbconts):
Parameters
----------
genomes : dict
{genome: [name, path, size, nbcont, l90]}
{genome: [name, orig_path, to_annotate_path, size, nbcont, l90]}
res_path : str
path to put all output files
listfile_base : str
......@@ -436,18 +450,11 @@ def plot_distributions(genomes, res_path, listfile_base, l90, nbconts):
- dist1 : matplotlib figure of distribution of L90 values
- dist2 : matplotlib figure of distribution of nb contigs values
"""
"""
genomes:
res_path:
listfile_base:
l90: max value of l90 allowed
nbconts: max value of nb contigs allowed
"""
logger.info("Generating distribution of L90 and #contigs graphs.")
l90_vals = [val for _, (_, _, _, _, val) in genomes.items()]
l90_vals = [val for _, (_, _, _, _, _, val) in genomes.items()]
outl90 = os.path.join(res_path, "QC_L90-" + listfile_base + ".png")
nbcont_vals = [val for _, (_, _, _, val, _) in genomes.items()]
nbcont_vals = [val for _, (_, _, _, _, val, _) in genomes.items()]
outnbcont = os.path.join(res_path, "QC_nb-contigs-" + listfile_base + ".png")
dist1 = utils.plot_distr(l90_vals, l90, "L90 distribution for all genomes",
"max L90 =")
......
......@@ -395,11 +395,12 @@ def write_discarded(genomes, kept_genomes, list_file, res_path, qc=False):
Parameters
----------
genomes : dict
{genome: [gembase_start_name, seq_file, genome_size, nb_contigs, L90]}
{genome: [gembase_start_name, orig_seq_file, to_annotate_seq_file,
genome_size, nb_contigs, L90]}
kept_genomes : list
list of genomes kept
list_file : str
input file containing the list of genomes
path to input file containing the list of genomes
res_path : str
folder where results must be saved
qc : bool
......@@ -408,40 +409,46 @@ def write_discarded(genomes, kept_genomes, list_file, res_path, qc=False):
"""
logger = logging.getLogger("utils")
# number of genomes discarded
nb_disc = len(genomes) - len(kept_genomes)
# Log number of genomes discarded.
if not qc and nb_disc < 2:
logger.info("{} genome was discarded.".format(nb_disc))
elif not qc:
logger.info("{} genomes were discarded.".format(nb_disc))
# Get input list file name (without path)
_, name_lst = os.path.split(list_file)
# if not QC, write discarded genomes to a file "discarded-[list_file].lst"
if not qc:
outdisc = os.path.join(res_path,
"discarded-" + ".".join(name_lst.split(".")[:-1]) + ".lst")
logger.info("Writing discarded genomes to {}".format(outdisc))
# if QC, there is no 'discarded genome', just write information on all analyzed genomes
else:
outdisc = os.path.join(res_path,
"info-genomes-" + ".".join(name_lst.split(".")[:-1]) + ".lst")
logger.info("Writting information on genomes in {}".format(outdisc))
logger.info("Writing information on genomes in {}".format(outdisc))
with open(outdisc, "w") as outdf:
outdf.write("\t".join(["orig_name", "gsize", "nb_conts", "L90"]) + "\n")
for genome, values in genomes.items():
if genome in kept_genomes:
continue
_, _, gsize, nbcont, l90 = [str(x) for x in values]
_, _, _, gsize, nbcont, l90 = [str(x) for x in values]
outdf.write("\t".join([genome, gsize, nbcont, l90]) + "\n")
def write_lstinfo(list_file, genomes, outdir):
"""
Write lstinfo file, with following columns:
gembase_name, orig_name, size, nbcontigs, l90
gembase_name, orig_name, to_annotate_name, size, nbcontigs, l90
Parameters
----------
list_file : str
input file containing the list of genomes
genomes : dict
{genome: [gembase_start_name, seq_file, genome_size, nb_contigs, L90]}
{genome: [gembase_start_name, seq_file, seq_to_annotate, genome_size, nb_contigs, L90]}
outdir : str
folder where results must be saved
......@@ -450,10 +457,12 @@ def write_lstinfo(list_file, genomes, outdir):
outlst = os.path.join(outdir, "LSTINFO-" + ".".join(name_lst.split(".")[:-1]) + ".lst")
with open(outlst, "w") as outf:
outf.write("\t".join(["gembase_name", "orig_name", "gsize", "nb_conts", "L90"]) + "\n")
outf.write("\t".join(["gembase_name", "orig_name", "to_annotate", "gsize",
"nb_conts", "L90"]) + "\n")
for genome, values in sorted(genomes.items(), key=sort_genomes):
gembase, _, gsize, nbcont, l90 = [str(x) for x in values]
outf.write("\t".join([gembase, genome, gsize, nbcont, l90]) + "\n")
gembase, _, to_annote, gsize, nbcont, l90 = [str(x) for x in values]
to_annote_file = os.path.basename(to_annote)
outf.write("\t".join([gembase, genome, to_annote_file, gsize, nbcont, l90]) + "\n")
def sort_genomes(x):
......@@ -571,6 +580,7 @@ def read_genomes(list_file, name, date, dbpath, tmp_path):
genomes_inf = genomes_inf.strip()
cur_name, cur_date = read_info(name_inf, name, date, genomes_inf)
# If no separator '::', no information on name and date of genome: use default ones
# Line only contains genome name (filename in given db_path)
else:
genomes_inf = line.strip()
cur_name = name
......@@ -640,7 +650,9 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path):
Returns
-------
dict
genomes = {genome: [spegenus.date, path_to_splitSequence, size, nbcont, l90]}
genomes = {genome:
[spegenus.date, path_orig_seq, path_to_splitSequence, size, nbcont, l90]
}
"""
logger = logging.getLogger("utils")
logger.info("Reading given information on your genomes")
......@@ -690,7 +702,9 @@ def read_genomes_info(list_file, name, date, dbpath, tmp_path):
logger.warning(("{} genome file does not exist in the given database. "
"It will be ignored.".format(gpath)))
continue
genomes[gname] = [spegenus, gpath, gsize, gcont, gl90]
# cur genome information to save:
# [spegenus.date, path_orig_seq, path_to_splitSequence, size, nbcont, l90]
genomes[gname] = [spegenus, gpath, gpath, gsize, gcont, gl90]
return genomes
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment