diff --git a/PanACoTA/annotate_module/format_prodigal.py b/PanACoTA/annotate_module/format_prodigal.py index 87c1fd08f70263f0ac354beb9da88d96b9fe8be6..40f5cc55546d47d32d5dd5f427f2b9ef99826b46 100644 --- a/PanACoTA/annotate_module/format_prodigal.py +++ b/PanACoTA/annotate_module/format_prodigal.py @@ -84,11 +84,11 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, contigs, sizes = utils.get_genome_contigs_and_rename(name, gpath, res_rep_file) if not contigs: try: + os.remove(res_rep_file) os.remove(res_lst_file) os.remove(res_gff_file) os.remove(res_gene_file) os.remove(res_prt_file) - os.remove(res_rep_file) except OSError: pass logger.error("Problems while generating Replicon file for {}".format(name)) @@ -99,8 +99,11 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, ok = create_gene_lst(contigs, gen_file, res_gene_file, res_lst_file, gpath, name) if not ok: try: + os.remove(res_rep_file) os.remove(res_gene_file) os.remove(res_lst_file) + os.remove(res_gff_file) + os.remove(res_prot_file) except OSError: pass logger.error(f"Problems while generating .gen and .lst files for {name}") @@ -116,10 +119,11 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, os.remove(res_lst_file) os.remove(res_rep_file) os.remove(res_gff_file) + os.remove(res_prot_file) except OSError: pass - logger.error("Problems while generating .fna (Replicons folder) and .gff (gff3 folder) " - "files for {}".format(name)) + logger.error("Problems while generating .gff (gff3 folder) " + "file for {}".format(name)) return False # Generate .prt files (in Proteins directory) @@ -135,7 +139,7 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, os.remove(res_prot_file) except OSError: pass - logger.error("Problems while generating .prt files (Proteins folder) " + logger.error("Problems while generating .prt file (Proteins folder) " "for {}".format(name)) return False return ok @@ -209,8 +213,11 @@ def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name): (gname, start, end, strand, info) = lineffn.strip().split(">")[-1].split("#") # Get contig number from prodigal gene header: prodigal first part of header is: # <original genome name contig name>_<protein number> - contig_name = "_".join(gname.strip().split("_")[:-1]) - + contig_name = gname.strip().split("_") + if len(contig_name) > 1: + contig_name = "_".join(contig_name[:-1]) + else: + contig_name = contig_name[0] # If new contig: # - previous gene was the last of its contig -> prev_loc = "b" ; # - the current gene is the first of its contig (loc = "b") @@ -221,7 +228,8 @@ def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name): contig_num = contigs[contig_name].split(".")[-1] # if not in the list, problem, return false else: - logger.error(f"{contig_name} found in {gen_file} does not exist in {gpath}.") + logger.error(f"'{contig_name}' found in {gen_file} does not exist in " + f"{gpath}.") return False prev_loc = 'b' loc = 'b' diff --git a/PanACoTA/utils.py b/PanACoTA/utils.py index 69e51888e3eba396a58e507a8a7b1aba5f6034e0..36957c3fc41865e93d4291befd7c2bc11f73db5d 100755 --- a/PanACoTA/utils.py +++ b/PanACoTA/utils.py @@ -1135,10 +1135,11 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile): cont = "\t".join([prev_cont, str(cont_size)]) + "\n" prevcont_nohead = "".join(prev_cont.split(">")[1:]) prev_orig_name_nohead = "".join(prev_orig_name.split(">")[1:]) - sizes[prevcont_nohead] = cont_size - contigs[prev_orig_name_nohead] = prevcont_nohead - grf.write(cont) - grf.write(seq) + if prev_orig_name_nohead: + sizes[prevcont_nohead] = cont_size + contigs[prev_orig_name_nohead] = prevcont_nohead + grf.write(cont) + grf.write(seq) prev_cont = ">" + gembase_name + "." + str(contig_num).zfill(4) prev_orig_name = line.strip() contig_num += 1 @@ -1152,10 +1153,11 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile): cont = "\t".join([prev_cont, str(cont_size)]) + "\n" prevcont_nohead = "".join(prev_cont.split(">")[1:]) prev_orig_name_nohead = "".join(prev_orig_name.split(">")[1:]) - contigs[prev_orig_name_nohead] = prevcont_nohead - sizes[prevcont_nohead] = cont_size - grf.write(cont) - grf.write(seq) + if prev_orig_name_nohead: + contigs[prev_orig_name_nohead] = prevcont_nohead + sizes[prevcont_nohead] = cont_size + grf.write(cont) + grf.write(seq) return contigs, sizes diff --git a/test/test_unit/test_annotate/test_format_prodigal.py b/test/test_unit/test_annotate/test_format_prodigal.py index 5ddd92b10d18095b35f797dfbda78e0668c3c10c..31a3faee8a23efe7b87130bb76c211a3eea712d2 100644 --- a/test/test_unit/test_annotate/test_format_prodigal.py +++ b/test/test_unit/test_annotate/test_format_prodigal.py @@ -108,7 +108,7 @@ def test_create_gen_lst_cont_unknown(caplog): gpath = "original_genome_name" assert not prodigalfunc.create_gene_lst(contigs, genfile, res_gen_file, res_lst_file, gpath, name) - assert ("my_contig found in test/data/annotate/test_files/original_name.fna-prodigalRes/" + assert ("'my_contig' found in test/data/annotate/test_files/original_name.fna-prodigalRes/" "prodigal.outtest.ok.faa does not exist in original_genome_name") in caplog.text @@ -292,9 +292,191 @@ def test_format_1genome(caplog): os.makedirs(gff_dir) assert prodigalfunc.format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, - rep_dir, gff_dir, logger) + rep_dir, gff_dir, logger) +def test_format_1genome_emptygpath(caplog): + """ + Test on formatting prodigal results, when ffn file is empty -> error message, + and no file generated + """ + caplog.set_level(logging.DEBUG) + logger = logging.getLogger("test_prodigal") + name = "prodigal.outtest.ok" + # Create empty file, that we give to prodigal for formatting step + gpath = os.path.join(GENEPATH, "original_name-empty.fna") + open(gpath, "w").close() + prod_path = TEST_ANNOTE + prot_dir = os.path.join(GENEPATH, "Proteins") + lst_dir = os.path.join(GENEPATH, "LSTINFO") + rep_dir = os.path.join(GENEPATH, "Replicons") + gene_dir = os.path.join(GENEPATH, "Genes") + gff_dir = os.path.join(GENEPATH, "gff") + os.makedirs(rep_dir) + + assert not prodigalfunc.format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir, + rep_dir, gff_dir, logger) + assert ("Problems while generating Replicon file for prodigal.outtest.ok") in caplog.text + + +def test_format_1genome_wrongffn(caplog): + """ + Test on formatting prodigal results, when the file given to prodigal was empty + -> error message, + -> no file generated + """ + caplog.set_level(logging.DEBUG) + logger = logging.getLogger("test_prodigal") + name = "prodigal.outtest.ok" + # path to original genome, given to prodigal for annotation + orig_gpath = os.path.join(TEST_ANNOTE, "original_name.fna") + # In generated_by_tests folder, create the original genome given to prodigal + # (copy from test_file) + used_gpath = os.path.join(GENEPATH, "original_name.fna") + used_respath = used_gpath + "-prodigalRes" + os.makedirs(used_respath) + shutil.copyfile(orig_gpath, used_gpath) + # Create gen_file with a header not existing + with open(os.path.join(used_respath, "prodigal.outtest.ok.ffn"), "w") as ori: + ori.write(">wrongheader # 1 # 2 # 1 # toto") + prod_path = GENEPATH + prot_dir = os.path.join(GENEPATH, "Proteins") + lst_dir = os.path.join(GENEPATH, "LSTINFO") + rep_dir = os.path.join(GENEPATH, "Replicons") + gene_dir = os.path.join(GENEPATH, "Genes") + gff_dir = os.path.join(GENEPATH, "gff") + os.makedirs(rep_dir) + os.makedirs(gene_dir) + os.makedirs(lst_dir) + + assert not prodigalfunc.format_one_genome(used_gpath, name, prod_path, lst_dir, prot_dir, + gene_dir, rep_dir, gff_dir, logger) + # # Check that replicon file was removed + # assert len(os.listdir(rep_dir) ) == 0 + assert ("'wrongheader' found in test/data/annotate/generated_by_unit-tests/" + "original_name.fna-prodigalRes/prodigal.outtest.ok.ffn does not exist in " + "test/data/annotate/generated_by_unit-tests/original_name.fna") in caplog.text + assert ("Problems while generating .gen and .lst files for prodigal.outtest.ok") in caplog.text + + +def test_format_1genome_wronglst(caplog): + """ + Test on formatting prodigal results, when the file given to prodigal was empty + -> error message, + -> no file generated + """ + caplog.set_level(logging.DEBUG) + logger = logging.getLogger("test_prodigal") + name = "prodigal.outtest.ok" + + # path to original genome, given to prodigal for annotation + orig_gpath = os.path.join(TEST_ANNOTE, "original_name.fna") + # In generated_by_tests folder, create the original genome given to prodigal + # (copy from test_file) + used_gpath = os.path.join(GENEPATH, "original_name.fna") + used_respath = used_gpath + "-prodigalRes" + os.makedirs(used_respath) + shutil.copyfile(orig_gpath, used_gpath) + # Copy ffn file generated by prodigal: + orig_ffn = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.ffn") + used_ffn = os.path.join(used_respath, "prodigal.outtest.ok.ffn") + shutil.copyfile(orig_ffn, used_ffn) + # Copy gff file, but modify to get wrong start position + orig_gff = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.gff") + used_gff = os.path.join(used_respath, "prodigal.outtest.ok.gff") + with open(orig_gff, "r") as orig, open(used_gff, "w") as gffw: + for line in orig: + if line.startswith("#"): + gffw.write(line) + else: + fields_g = line.split("\t") + (contig_name, source, type_g, start_g, end_g, + score, strand_g, phase, attributes) = fields_g + gffw.write("\t".join([contig_name, source, type_g, "12", end_g, score, strand_g, + phase, attributes])) + break + prod_path = GENEPATH + prot_dir = os.path.join(GENEPATH, "Proteins") + lst_dir = os.path.join(GENEPATH, "LSTINFO") + rep_dir = os.path.join(GENEPATH, "Replicons") + gene_dir = os.path.join(GENEPATH, "Genes") + gff_dir = os.path.join(GENEPATH, "gff3") + os.makedirs(rep_dir) + os.makedirs(gene_dir) + os.makedirs(lst_dir) + os.makedirs(gff_dir) + # Copy generated lstfile, but modify first line to have a difference between gff and lst starts + + assert not prodigalfunc.format_one_genome(used_gpath, name, prod_path, lst_dir, prot_dir, + gene_dir, rep_dir, gff_dir, logger) + # # # Check that replicon file was removed + # # assert len(os.listdir(rep_dir) ) == 0 + assert ("Files prodigal.outtest.ok.ffn and prodigal.outtest.ok.gff (in prodigal tmp_files: " + "test/data/annotate/generated_by_unit-tests/original_name.fna-prodigalRes) do " + "not have the same start value for gene EPKOMDHM_00001 " + "(12 in gff, 287 in ffn)") in caplog.text + assert ("Problems while generating .gff (gff3 folder) file for " + "prodigal.outtest.ok") in caplog.text + + + +def test_format_1genome_wrongprt(caplog): + """ + Test on formatting prodigal results, when the file given to prodigal was empty + -> error message, + -> no file generated + """ + caplog.set_level(logging.DEBUG) + logger = logging.getLogger("test_prodigal") + name = "prodigal.outtest.ok" + + # path to original genome, given to prodigal for annotation + orig_gpath = os.path.join(TEST_ANNOTE, "original_name.fna") + # In generated_by_tests folder, create the original genome given to prodigal + # (copy from test_file) + used_gpath = os.path.join(GENEPATH, "original_name.fna") + used_respath = used_gpath + "-prodigalRes" + os.makedirs(used_respath) + shutil.copyfile(orig_gpath, used_gpath) + # Copy ffn file generated by prodigal: + orig_ffn = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.ffn") + used_ffn = os.path.join(used_respath, "prodigal.outtest.ok.ffn") + shutil.copyfile(orig_ffn, used_ffn) + # Copy gff file generated by prodigal: + orig_gff = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.gff") + used_gff = os.path.join(used_respath, "prodigal.outtest.ok.gff") + shutil.copyfile(orig_gff, used_gff) + # Copy prt file, but removing first protein + orig_faa = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.faa") + used_faa = os.path.join(used_respath, "prodigal.outtest.ok.faa") + with open(orig_faa, "r") as faa, open(used_faa, "w") as faar: + for i in range(7): + faa.readline() + for line in faa: + faar.write(line) + + prod_path = GENEPATH + prot_dir = os.path.join(GENEPATH, "Proteins") + lst_dir = os.path.join(GENEPATH, "LSTINFO") + rep_dir = os.path.join(GENEPATH, "Replicons") + gene_dir = os.path.join(GENEPATH, "Genes") + gff_dir = os.path.join(GENEPATH, "gff3") + gff_dir = os.path.join(GENEPATH, "Proteins") + os.makedirs(rep_dir) + os.makedirs(gene_dir) + os.makedirs(lst_dir) + os.makedirs(gff_dir) + # Copy generated lstfile, but modify first line to have a difference between gff and lst starts + + assert not prodigalfunc.format_one_genome(used_gpath, name, prod_path, lst_dir, prot_dir, + gene_dir, rep_dir, gff_dir, logger) + # # # Check that replicon file was removed + # # assert len(os.listdir(rep_dir) ) == 0 + assert ("Protein prodigal.outtest.ok.0007b_00013 is in .lst file but its sequence is not in " + "the protein file generated by prodigal.") in caplog.text + assert ("Problems while generating .prt file (Proteins folder) for " + "prodigal.outtest.ok") in caplog.text + # # def test_tbl_to_lst_not_changed_names(caplog): # """ # Check that generated lstinfo file is as expected, when the genome name is the same as