diff --git a/PanACoTA/annotate_module/genome_seq_functions.py b/PanACoTA/annotate_module/genome_seq_functions.py index 6ec2bb814e86dd8dcbe171be5f07212d85418e83..dbb8a2b3a006bd9f78e29e65fd1b0accd95757a1 100755 --- a/PanACoTA/annotate_module/genome_seq_functions.py +++ b/PanACoTA/annotate_module/genome_seq_functions.py @@ -310,16 +310,7 @@ def format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, num, if cut: # Cut sequence and write header + sequence to res file num = split_contig(pat, cur_seq, cur_contig_name, contig_sizes, gresf, num) - # PROKKA User does not want to cut, but will annotate with prokka, so we still - # have to create a new sequence file - elif gresf: - new_contig_name = "{}_{}\n".format(cur_contig_name, num) - gresf.write(new_contig_name) - gresf.write(cur_seq + "\n") - contig_sizes[new_contig_name] = len(cur_seq) - num += 1 - # PRODIGAL No cut, and prodigal used -> no new file created, but check - # contig unique names + # No cut -> no new file created, but check contig unique names else: if cur_contig_name in contig_sizes.keys(): logger.error("{} contig name is used for several contigs. Please put " diff --git a/test/data/annotate/exp_files/genome2-split5N.fna b/test/data/annotate/exp_files/genome2-split5N.fna index e58ea8921d9107096f6e753605306c43b67174bc..f2053a5ebcd7fcf3cb453ae93c59a1aef67571c8 100644 --- a/test/data/annotate/exp_files/genome2-split5N.fna +++ b/test/data/annotate/exp_files/genome2-split5N.fna @@ -1,10 +1,10 @@ ->contig1 dgfdgd_1 +>1_contig1 dgfdgd ACGTTGCTGC ->contig1 dgfdgd_2 +>2_contig1 dgfdgd AGCTGTCTAG ->contig2_3 +>3_contig2 CGACGNNCGAG ->contig2_4 +>4_contig2 AGGTG ->contig3_5 +>5_contig3 ANNNNGGCTTGAGGTTGAA diff --git a/test/data/annotate/exp_files/res_genome_short-long_header.fst b/test/data/annotate/exp_files/res_genome_short-long_header.fst index 4a61e3778bead6eea69d1b93760ecd2c1a6bbf0d..aaccf955b76f8a50a51a6f81d95ce1860ee107f0 100755 --- a/test/data/annotate/exp_files/res_genome_short-long_header.fst +++ b/test/data/annotate/exp_files/res_genome_short-long_header.fst @@ -1,6 +1,6 @@ ->Long_header_with_same_1_1 +>1_Long_header_with_same_1 AATTGCGCTAGCGCTAGGCGCTAGCGCGCTAGAGCCGCTAGGCGCCATTACGGCGCTATCCGCACGCGCATGCCACCGTTAG ->Long_header_with_same_2_2 +>2_Long_header_with_same_2 AACCGTTGGGGGGGGGGCCCCATTAGGCGCGGAATTTTCG ->Long_header_with_same_3_3 +>3_Long_header_with_same_3 ACGGCTCGCGGAGAGAGAGAGCTCGCGCA diff --git a/test/test_unit/test_annotate/test_genome_func.py b/test/test_unit/test_annotate/test_genome_func.py index 2e782af10beca77c54dc041fec045d5813b3400b..b17924218c33dc9c60616fc9ea55411ac11eec08 100755 --- a/test/test_unit/test_annotate/test_genome_func.py +++ b/test/test_unit/test_annotate/test_genome_func.py @@ -40,14 +40,16 @@ def setup_teardown_module(): - remove directory with generated results """ # utils.init_logger(LOGFILE_BASE, 0, 'test_postalign', verbose=1) - os.mkdir(GENEPATH) + if os.path.isdir(GENEPATH): + content = os.listdir(GENEPATH) + for f in content: + assert f.startswith(".fuse") + else: + os.mkdir(GENEPATH) print("setup") yield - shutil.rmtree(GENEPATH) - # for f in LOGFILES: - # if os.path.exists(f): - # os.remove(f) + shutil.rmtree(GENEPATH, ignore_errors=True) print("teardown") @@ -195,7 +197,7 @@ def test_format_contig_cut(): cut = True pat = 'NNNNN+' cur_seq = "AACTGCTTTTTAAGCGCGCTCCTGCGNNNNNGGTTGTGTGGGCCCAGAGCGAGNCG" - cur_contig_name = ">my_contig_name_for_my_sequence" + cur_contig_name = ">my_contig_name for_my_sequence" contig_sizes = {} resfile = os.path.join(GENEPATH, "test_format_cont_cut5N.fna") gresf = open(resfile, "w") @@ -208,8 +210,8 @@ def test_format_contig_cut(): exp_file = os.path.join(EXP_DIR, "exp_split_contig_cut3N.fna") assert os.path.exists(resfile) assert tutil.compare_order_content(resfile, exp_file) - assert contig_sizes == {">my_contig_name_for_my_sequence_2\n": 26, - ">my_contig_name_for_my_sequence_3\n": 25} + assert contig_sizes == {">2_my_contig_name for_my_sequence\n": 26, + ">3_my_contig_name for_my_sequence\n": 25} def test_format_contig_nocut(): @@ -220,23 +222,21 @@ def test_format_contig_nocut(): cut = False pat = None cur_seq = "AACTGCTTTTTAAGCGCGCTCCTGCGNNNNNGGTTGTGTGGGCCCAGAGCGAGNCG" - cur_contig_name = ">my_contig_name_for_my_sequence" + cur_contig_name = ">my_contig_name_for_my_sequence\n" contig_sizes = {} resfile = os.path.join(GENEPATH, "test_format_cont_nocut_prokka.fna") - gresf = open(resfile, "w") + gresf = None num = 2 assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, - num, logger=None) == 3 - gresf.close() + num, logger=None) == 2 exp_file = os.path.join(EXP_DIR, "exp_split_contig_nocut.fna") - assert os.path.exists(resfile) - assert tutil.compare_order_content(resfile, exp_file) - assert contig_sizes == {">my_contig_name_for_my_sequence_2\n": 56} + assert not os.path.exists(resfile) + assert contig_sizes == {">my_contig_name_for_my_sequence\n": 56} -def test_format_contig_nocut_prodigal_notSameName(): +def test_format_contig_nocut_notDuplicateName(): """ For a given contig, if we want to annotate it with prodigal, and do not cut, then we keep the same file (no need to split at 20 characters) @@ -262,7 +262,7 @@ def test_format_contig_nocut_prodigal_notSameName(): ">mycontig": 155} -def test_format_contig_nocut_prodigal_SameName(caplog): +def test_format_contig_nocut_DuplicateName(caplog): """ For a given contig, if we want to annotate it with prodigal, and do not cut, then we keep the same file. However, we must check that contig names are all different. Try to add a contig which name is already used, check that it prints the expected error, @@ -460,7 +460,7 @@ def test_analyse1genome_cut_same_names(): """ Analyse a genome. Its contig names all have the same first 20 characters. There is no stretch of at least 5N, so contigs are not split. - New contig names should be uniq, and not all ending with _0! + New contig names should be uniq, and not all starting with '1_'! """ genome = "genome_long_header.fst" genomes = {genome: ["SAEN.1015.0117"]}