diff --git a/PanACoTA/utils.py b/PanACoTA/utils.py index 04969397cacc5db5a4153e9559bdb4e89e2e118b..4d51eecffea54da35762d47778cf6eced939526e 100755 --- a/PanACoTA/utils.py +++ b/PanACoTA/utils.py @@ -1150,6 +1150,10 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile, logger): prevcont_nohead = prev_cont.split(">")[1] prev_orig_name_nohead = prev_orig_name.split(">")[1] if prev_orig_name_nohead: + if prev_orig_name_nohead in contigs: + logger.error(f"several contigs have the same name " + f"{prev_orig_name_nohead} in {gpath}.") + return False, False sizes[prevcont_nohead] = cont_size contigs[prev_orig_name_nohead] = prevcont_nohead grf.write(cont) @@ -1170,7 +1174,8 @@ def get_genome_contigs_and_rename(gembase_name, gpath, outfile, logger): prev_orig_name_nohead = prev_orig_name.split(">")[1] if prev_orig_name_nohead: if prev_orig_name_nohead in contigs: - logger.error(f"several contigs have the same name {prev_cont} in {gpath}.") + logger.error(f"several contigs have the same name {prev_orig_name_nohead} " + f"in {gpath}.") return False, False contigs[prev_orig_name_nohead] = prevcont_nohead sizes[prevcont_nohead] = cont_size diff --git a/test/test_unit/test_utils.py b/test/test_unit/test_utils.py index 57fffe61ae21535dfb108c9341bb3f00689f27d5..a1a7970765f62efa48d05532ba63b7b9efc513d4 100755 --- a/test/test_unit/test_utils.py +++ b/test/test_unit/test_utils.py @@ -918,6 +918,50 @@ def test_rename_contigs(): assert utilities.compare_order_content(outfile, exp_file) +def test_rename_contigs_duplicate(caplog): + """ + From a given sequence, there are 2 contigs named "contig2". Stops and returns false + """ + logger = logging.getLogger("default") + gpath = os.path.join(DATA_DIR, "genomes", "genome-duplicated-header.fasta") + gembase_name = "ESCO.0216.00005" + outfile = os.path.join(GENEPATH, "genome_dup_error.fna") + exp_file = os.path.join(DATA_DIR, "exp_files", "res_H299_H561-ESCO00005.fna") + contigs, sizes = utils.get_genome_contigs_and_rename(gembase_name, gpath, outfile, logger) + assert not contigs + assert not sizes + with open(outfile, "r") as of: + assert of.readline().startswith(">ESCO.0216.00005.0001") + of.readline() # skip sequence + assert of.readline().startswith(">ESCO.0216.00005.0002") + of.readline() # skip sequence + assert of.readline().startswith(">ESCO.0216.00005.0003") + assert ("several contigs have the same name contig2 in test/data/annotate/genomes/" + "genome-duplicated-header.fasta.") in caplog.text + + +def test_rename_contigs_duplicate_last(caplog): + """ + The last contig of the sequence has the same name as a previous contig. Stops and returns false + """ + logger = logging.getLogger("default") + gpath = os.path.join(DATA_DIR, "genomes", "genome-duplicated-header-last.fasta") + gembase_name = "ESCO.0216.00005" + outfile = os.path.join(GENEPATH, "genome_dup_error.fna") + exp_file = os.path.join(DATA_DIR, "exp_files", "res_H299_H561-ESCO00005.fna") + contigs, sizes = utils.get_genome_contigs_and_rename(gembase_name, gpath, outfile, logger) + assert not contigs + assert not sizes + with open(outfile, "r") as of: + assert of.readline().startswith(">ESCO.0216.00005.0001") + of.readline() # skip sequence + assert of.readline().startswith(">ESCO.0216.00005.0002") + of.readline() # skip sequence + assert of.readline().startswith(">ESCO.0216.00005.0003") + assert ("several contigs have the same name contig2 in test/data/annotate/genomes/" + "genome-duplicated-header-last.fasta.") in caplog.text + + def test_cat_nobar(): """ Check that when cat is called on a list of several files, the output file