Skip to content
Snippets Groups Projects
Commit 116db29a authored by Amandine  PERRIN's avatar Amandine PERRIN
Browse files

end of tests for format_prodigal functions

parent f500aeed
No related branches found
No related tags found
No related merge requests found
Pipeline #36217 passed
......@@ -84,11 +84,11 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
contigs, sizes = utils.get_genome_contigs_and_rename(name, gpath, res_rep_file)
if not contigs:
try:
os.remove(res_rep_file)
os.remove(res_lst_file)
os.remove(res_gff_file)
os.remove(res_gene_file)
os.remove(res_prt_file)
os.remove(res_rep_file)
except OSError:
pass
logger.error("Problems while generating Replicon file for {}".format(name))
......@@ -99,8 +99,11 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
ok = create_gene_lst(contigs, gen_file, res_gene_file, res_lst_file, gpath, name)
if not ok:
try:
os.remove(res_rep_file)
os.remove(res_gene_file)
os.remove(res_lst_file)
os.remove(res_gff_file)
os.remove(res_prot_file)
except OSError:
pass
logger.error(f"Problems while generating .gen and .lst files for {name}")
......@@ -116,10 +119,11 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
os.remove(res_lst_file)
os.remove(res_rep_file)
os.remove(res_gff_file)
os.remove(res_prot_file)
except OSError:
pass
logger.error("Problems while generating .fna (Replicons folder) and .gff (gff3 folder) "
"files for {}".format(name))
logger.error("Problems while generating .gff (gff3 folder) "
"file for {}".format(name))
return False
# Generate .prt files (in Proteins directory)
......@@ -135,7 +139,7 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
os.remove(res_prot_file)
except OSError:
pass
logger.error("Problems while generating .prt files (Proteins folder) "
logger.error("Problems while generating .prt file (Proteins folder) "
"for {}".format(name))
return False
return ok
......@@ -209,8 +213,11 @@ def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name):
(gname, start, end, strand, info) = lineffn.strip().split(">")[-1].split("#")
# Get contig number from prodigal gene header: prodigal first part of header is:
# <original genome name contig name>_<protein number>
contig_name = "_".join(gname.strip().split("_")[:-1])
contig_name = gname.strip().split("_")
if len(contig_name) > 1:
contig_name = "_".join(contig_name[:-1])
else:
contig_name = contig_name[0]
# If new contig:
# - previous gene was the last of its contig -> prev_loc = "b" ;
# - the current gene is the first of its contig (loc = "b")
......@@ -221,7 +228,8 @@ def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name):
contig_num = contigs[contig_name].split(".")[-1]
# if not in the list, problem, return false
else:
logger.error(f"{contig_name} found in {gen_file} does not exist in {gpath}.")
logger.error(f"'{contig_name}' found in {gen_file} does not exist in "
f"{gpath}.")
return False
prev_loc = 'b'
loc = 'b'
......
......@@ -1135,10 +1135,11 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile):
cont = "\t".join([prev_cont, str(cont_size)]) + "\n"
prevcont_nohead = "".join(prev_cont.split(">")[1:])
prev_orig_name_nohead = "".join(prev_orig_name.split(">")[1:])
sizes[prevcont_nohead] = cont_size
contigs[prev_orig_name_nohead] = prevcont_nohead
grf.write(cont)
grf.write(seq)
if prev_orig_name_nohead:
sizes[prevcont_nohead] = cont_size
contigs[prev_orig_name_nohead] = prevcont_nohead
grf.write(cont)
grf.write(seq)
prev_cont = ">" + gembase_name + "." + str(contig_num).zfill(4)
prev_orig_name = line.strip()
contig_num += 1
......@@ -1152,10 +1153,11 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile):
cont = "\t".join([prev_cont, str(cont_size)]) + "\n"
prevcont_nohead = "".join(prev_cont.split(">")[1:])
prev_orig_name_nohead = "".join(prev_orig_name.split(">")[1:])
contigs[prev_orig_name_nohead] = prevcont_nohead
sizes[prevcont_nohead] = cont_size
grf.write(cont)
grf.write(seq)
if prev_orig_name_nohead:
contigs[prev_orig_name_nohead] = prevcont_nohead
sizes[prevcont_nohead] = cont_size
grf.write(cont)
grf.write(seq)
return contigs, sizes
......
......@@ -108,7 +108,7 @@ def test_create_gen_lst_cont_unknown(caplog):
gpath = "original_genome_name"
assert not prodigalfunc.create_gene_lst(contigs, genfile, res_gen_file, res_lst_file,
gpath, name)
assert ("my_contig found in test/data/annotate/test_files/original_name.fna-prodigalRes/"
assert ("'my_contig' found in test/data/annotate/test_files/original_name.fna-prodigalRes/"
"prodigal.outtest.ok.faa does not exist in original_genome_name") in caplog.text
......@@ -292,9 +292,191 @@ def test_format_1genome(caplog):
os.makedirs(gff_dir)
assert prodigalfunc.format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
rep_dir, gff_dir, logger)
rep_dir, gff_dir, logger)
def test_format_1genome_emptygpath(caplog):
"""
Test on formatting prodigal results, when ffn file is empty -> error message,
and no file generated
"""
caplog.set_level(logging.DEBUG)
logger = logging.getLogger("test_prodigal")
name = "prodigal.outtest.ok"
# Create empty file, that we give to prodigal for formatting step
gpath = os.path.join(GENEPATH, "original_name-empty.fna")
open(gpath, "w").close()
prod_path = TEST_ANNOTE
prot_dir = os.path.join(GENEPATH, "Proteins")
lst_dir = os.path.join(GENEPATH, "LSTINFO")
rep_dir = os.path.join(GENEPATH, "Replicons")
gene_dir = os.path.join(GENEPATH, "Genes")
gff_dir = os.path.join(GENEPATH, "gff")
os.makedirs(rep_dir)
assert not prodigalfunc.format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
rep_dir, gff_dir, logger)
assert ("Problems while generating Replicon file for prodigal.outtest.ok") in caplog.text
def test_format_1genome_wrongffn(caplog):
"""
Test on formatting prodigal results, when the file given to prodigal was empty
-> error message,
-> no file generated
"""
caplog.set_level(logging.DEBUG)
logger = logging.getLogger("test_prodigal")
name = "prodigal.outtest.ok"
# path to original genome, given to prodigal for annotation
orig_gpath = os.path.join(TEST_ANNOTE, "original_name.fna")
# In generated_by_tests folder, create the original genome given to prodigal
# (copy from test_file)
used_gpath = os.path.join(GENEPATH, "original_name.fna")
used_respath = used_gpath + "-prodigalRes"
os.makedirs(used_respath)
shutil.copyfile(orig_gpath, used_gpath)
# Create gen_file with a header not existing
with open(os.path.join(used_respath, "prodigal.outtest.ok.ffn"), "w") as ori:
ori.write(">wrongheader # 1 # 2 # 1 # toto")
prod_path = GENEPATH
prot_dir = os.path.join(GENEPATH, "Proteins")
lst_dir = os.path.join(GENEPATH, "LSTINFO")
rep_dir = os.path.join(GENEPATH, "Replicons")
gene_dir = os.path.join(GENEPATH, "Genes")
gff_dir = os.path.join(GENEPATH, "gff")
os.makedirs(rep_dir)
os.makedirs(gene_dir)
os.makedirs(lst_dir)
assert not prodigalfunc.format_one_genome(used_gpath, name, prod_path, lst_dir, prot_dir,
gene_dir, rep_dir, gff_dir, logger)
# # Check that replicon file was removed
# assert len(os.listdir(rep_dir) ) == 0
assert ("'wrongheader' found in test/data/annotate/generated_by_unit-tests/"
"original_name.fna-prodigalRes/prodigal.outtest.ok.ffn does not exist in "
"test/data/annotate/generated_by_unit-tests/original_name.fna") in caplog.text
assert ("Problems while generating .gen and .lst files for prodigal.outtest.ok") in caplog.text
def test_format_1genome_wronglst(caplog):
"""
Test on formatting prodigal results, when the file given to prodigal was empty
-> error message,
-> no file generated
"""
caplog.set_level(logging.DEBUG)
logger = logging.getLogger("test_prodigal")
name = "prodigal.outtest.ok"
# path to original genome, given to prodigal for annotation
orig_gpath = os.path.join(TEST_ANNOTE, "original_name.fna")
# In generated_by_tests folder, create the original genome given to prodigal
# (copy from test_file)
used_gpath = os.path.join(GENEPATH, "original_name.fna")
used_respath = used_gpath + "-prodigalRes"
os.makedirs(used_respath)
shutil.copyfile(orig_gpath, used_gpath)
# Copy ffn file generated by prodigal:
orig_ffn = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.ffn")
used_ffn = os.path.join(used_respath, "prodigal.outtest.ok.ffn")
shutil.copyfile(orig_ffn, used_ffn)
# Copy gff file, but modify to get wrong start position
orig_gff = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.gff")
used_gff = os.path.join(used_respath, "prodigal.outtest.ok.gff")
with open(orig_gff, "r") as orig, open(used_gff, "w") as gffw:
for line in orig:
if line.startswith("#"):
gffw.write(line)
else:
fields_g = line.split("\t")
(contig_name, source, type_g, start_g, end_g,
score, strand_g, phase, attributes) = fields_g
gffw.write("\t".join([contig_name, source, type_g, "12", end_g, score, strand_g,
phase, attributes]))
break
prod_path = GENEPATH
prot_dir = os.path.join(GENEPATH, "Proteins")
lst_dir = os.path.join(GENEPATH, "LSTINFO")
rep_dir = os.path.join(GENEPATH, "Replicons")
gene_dir = os.path.join(GENEPATH, "Genes")
gff_dir = os.path.join(GENEPATH, "gff3")
os.makedirs(rep_dir)
os.makedirs(gene_dir)
os.makedirs(lst_dir)
os.makedirs(gff_dir)
# Copy generated lstfile, but modify first line to have a difference between gff and lst starts
assert not prodigalfunc.format_one_genome(used_gpath, name, prod_path, lst_dir, prot_dir,
gene_dir, rep_dir, gff_dir, logger)
# # # Check that replicon file was removed
# # assert len(os.listdir(rep_dir) ) == 0
assert ("Files prodigal.outtest.ok.ffn and prodigal.outtest.ok.gff (in prodigal tmp_files: "
"test/data/annotate/generated_by_unit-tests/original_name.fna-prodigalRes) do "
"not have the same start value for gene EPKOMDHM_00001 "
"(12 in gff, 287 in ffn)") in caplog.text
assert ("Problems while generating .gff (gff3 folder) file for "
"prodigal.outtest.ok") in caplog.text
def test_format_1genome_wrongprt(caplog):
"""
Test on formatting prodigal results, when the file given to prodigal was empty
-> error message,
-> no file generated
"""
caplog.set_level(logging.DEBUG)
logger = logging.getLogger("test_prodigal")
name = "prodigal.outtest.ok"
# path to original genome, given to prodigal for annotation
orig_gpath = os.path.join(TEST_ANNOTE, "original_name.fna")
# In generated_by_tests folder, create the original genome given to prodigal
# (copy from test_file)
used_gpath = os.path.join(GENEPATH, "original_name.fna")
used_respath = used_gpath + "-prodigalRes"
os.makedirs(used_respath)
shutil.copyfile(orig_gpath, used_gpath)
# Copy ffn file generated by prodigal:
orig_ffn = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.ffn")
used_ffn = os.path.join(used_respath, "prodigal.outtest.ok.ffn")
shutil.copyfile(orig_ffn, used_ffn)
# Copy gff file generated by prodigal:
orig_gff = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.gff")
used_gff = os.path.join(used_respath, "prodigal.outtest.ok.gff")
shutil.copyfile(orig_gff, used_gff)
# Copy prt file, but removing first protein
orig_faa = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.faa")
used_faa = os.path.join(used_respath, "prodigal.outtest.ok.faa")
with open(orig_faa, "r") as faa, open(used_faa, "w") as faar:
for i in range(7):
faa.readline()
for line in faa:
faar.write(line)
prod_path = GENEPATH
prot_dir = os.path.join(GENEPATH, "Proteins")
lst_dir = os.path.join(GENEPATH, "LSTINFO")
rep_dir = os.path.join(GENEPATH, "Replicons")
gene_dir = os.path.join(GENEPATH, "Genes")
gff_dir = os.path.join(GENEPATH, "gff3")
gff_dir = os.path.join(GENEPATH, "Proteins")
os.makedirs(rep_dir)
os.makedirs(gene_dir)
os.makedirs(lst_dir)
os.makedirs(gff_dir)
# Copy generated lstfile, but modify first line to have a difference between gff and lst starts
assert not prodigalfunc.format_one_genome(used_gpath, name, prod_path, lst_dir, prot_dir,
gene_dir, rep_dir, gff_dir, logger)
# # # Check that replicon file was removed
# # assert len(os.listdir(rep_dir) ) == 0
assert ("Protein prodigal.outtest.ok.0007b_00013 is in .lst file but its sequence is not in "
"the protein file generated by prodigal.") in caplog.text
assert ("Problems while generating .prt file (Proteins folder) for "
"prodigal.outtest.ok") in caplog.text
# # def test_tbl_to_lst_not_changed_names(caplog):
# """
# Check that generated lstinfo file is as expected, when the genome name is the same as
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment