From c5ac2d6b11b4bb6cf79505e2f21838bf6de87eb6 Mon Sep 17 00:00:00 2001 From: Amandine PERRIN <amandine.perrin@pasteur.fr> Date: Mon, 12 Oct 2020 12:29:07 +0200 Subject: [PATCH] Adapt test to new 'splitcontig format' Before, when we split contigs, we added '_num' at the end of the header. Now, it is at the beginning --- .../annotate_module/genome_seq_functions.py | 11 +----- .../annotate/exp_files/genome2-split5N.fna | 10 +++--- .../res_genome_short-long_header.fst | 6 ++-- .../test_annotate/test_genome_func.py | 36 +++++++++---------- 4 files changed, 27 insertions(+), 36 deletions(-) diff --git a/PanACoTA/annotate_module/genome_seq_functions.py b/PanACoTA/annotate_module/genome_seq_functions.py index 6ec2bb81..dbb8a2b3 100755 --- a/PanACoTA/annotate_module/genome_seq_functions.py +++ b/PanACoTA/annotate_module/genome_seq_functions.py @@ -310,16 +310,7 @@ def format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, num, if cut: # Cut sequence and write header + sequence to res file num = split_contig(pat, cur_seq, cur_contig_name, contig_sizes, gresf, num) - # PROKKA User does not want to cut, but will annotate with prokka, so we still - # have to create a new sequence file - elif gresf: - new_contig_name = "{}_{}\n".format(cur_contig_name, num) - gresf.write(new_contig_name) - gresf.write(cur_seq + "\n") - contig_sizes[new_contig_name] = len(cur_seq) - num += 1 - # PRODIGAL No cut, and prodigal used -> no new file created, but check - # contig unique names + # No cut -> no new file created, but check contig unique names else: if cur_contig_name in contig_sizes.keys(): logger.error("{} contig name is used for several contigs. 
Please put " diff --git a/test/data/annotate/exp_files/genome2-split5N.fna b/test/data/annotate/exp_files/genome2-split5N.fna index e58ea892..f2053a5e 100644 --- a/test/data/annotate/exp_files/genome2-split5N.fna +++ b/test/data/annotate/exp_files/genome2-split5N.fna @@ -1,10 +1,10 @@ ->contig1 dgfdgd_1 +>1_contig1 dgfdgd ACGTTGCTGC ->contig1 dgfdgd_2 +>2_contig1 dgfdgd AGCTGTCTAG ->contig2_3 +>3_contig2 CGACGNNCGAG ->contig2_4 +>4_contig2 AGGTG ->contig3_5 +>5_contig3 ANNNNGGCTTGAGGTTGAA diff --git a/test/data/annotate/exp_files/res_genome_short-long_header.fst b/test/data/annotate/exp_files/res_genome_short-long_header.fst index 4a61e377..aaccf955 100755 --- a/test/data/annotate/exp_files/res_genome_short-long_header.fst +++ b/test/data/annotate/exp_files/res_genome_short-long_header.fst @@ -1,6 +1,6 @@ ->Long_header_with_same_1_1 +>1_Long_header_with_same_1 AATTGCGCTAGCGCTAGGCGCTAGCGCGCTAGAGCCGCTAGGCGCCATTACGGCGCTATCCGCACGCGCATGCCACCGTTAG ->Long_header_with_same_2_2 +>2_Long_header_with_same_2 AACCGTTGGGGGGGGGGCCCCATTAGGCGCGGAATTTTCG ->Long_header_with_same_3_3 +>3_Long_header_with_same_3 ACGGCTCGCGGAGAGAGAGAGCTCGCGCA diff --git a/test/test_unit/test_annotate/test_genome_func.py b/test/test_unit/test_annotate/test_genome_func.py index 2e782af1..b1792421 100755 --- a/test/test_unit/test_annotate/test_genome_func.py +++ b/test/test_unit/test_annotate/test_genome_func.py @@ -40,14 +40,16 @@ def setup_teardown_module(): - remove directory with generated results """ # utils.init_logger(LOGFILE_BASE, 0, 'test_postalign', verbose=1) - os.mkdir(GENEPATH) + if os.path.isdir(GENEPATH): + content = os.listdir(GENEPATH) + for f in content: + assert f.startswith(".fuse") + else: + os.mkdir(GENEPATH) print("setup") yield - shutil.rmtree(GENEPATH) - # for f in LOGFILES: - # if os.path.exists(f): - # os.remove(f) + shutil.rmtree(GENEPATH, ignore_errors=True) print("teardown") @@ -195,7 +197,7 @@ def test_format_contig_cut(): cut = True pat = 'NNNNN+' cur_seq = 
"AACTGCTTTTTAAGCGCGCTCCTGCGNNNNNGGTTGTGTGGGCCCAGAGCGAGNCG" - cur_contig_name = ">my_contig_name_for_my_sequence" + cur_contig_name = ">my_contig_name for_my_sequence" contig_sizes = {} resfile = os.path.join(GENEPATH, "test_format_cont_cut5N.fna") gresf = open(resfile, "w") @@ -208,8 +210,8 @@ def test_format_contig_cut(): exp_file = os.path.join(EXP_DIR, "exp_split_contig_cut3N.fna") assert os.path.exists(resfile) assert tutil.compare_order_content(resfile, exp_file) - assert contig_sizes == {">my_contig_name_for_my_sequence_2\n": 26, - ">my_contig_name_for_my_sequence_3\n": 25} + assert contig_sizes == {">2_my_contig_name for_my_sequence\n": 26, + ">3_my_contig_name for_my_sequence\n": 25} def test_format_contig_nocut(): @@ -220,23 +222,21 @@ def test_format_contig_nocut(): cut = False pat = None cur_seq = "AACTGCTTTTTAAGCGCGCTCCTGCGNNNNNGGTTGTGTGGGCCCAGAGCGAGNCG" - cur_contig_name = ">my_contig_name_for_my_sequence" + cur_contig_name = ">my_contig_name_for_my_sequence\n" contig_sizes = {} resfile = os.path.join(GENEPATH, "test_format_cont_nocut_prokka.fna") - gresf = open(resfile, "w") + gresf = None num = 2 assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, - num, logger=None) == 3 - gresf.close() + num, logger=None) == 2 exp_file = os.path.join(EXP_DIR, "exp_split_contig_nocut.fna") - assert os.path.exists(resfile) - assert tutil.compare_order_content(resfile, exp_file) - assert contig_sizes == {">my_contig_name_for_my_sequence_2\n": 56} + assert not os.path.exists(resfile) + assert contig_sizes == {">my_contig_name_for_my_sequence\n": 56} -def test_format_contig_nocut_prodigal_notSameName(): +def test_format_contig_nocut_notDuplicateName(): """ For a given contig, if we want to annotate it with prodigal, and do not cut, then we keep the same file (no need to split at 20 characters) @@ -262,7 +262,7 @@ def test_format_contig_nocut_prodigal_notSameName(): ">mycontig": 155} -def test_format_contig_nocut_prodigal_SameName(caplog): 
+def test_format_contig_nocut_DuplicateName(caplog): """ For a given contig, if we want to annotate it with prodigal, and do not cut, then we keep the same file. However, we must check that contig names are all different. Try to add a contig which name is already used, check that it prints the expected error, @@ -460,7 +460,7 @@ def test_analyse1genome_cut_same_names(): """ Analyse a genome. Its contig names all have the same first 20 characters. There is no stretch of at least 5N, so contigs are not split. - New contig names should be uniq, and not all ending with _0! + New contig names should be uniq, and not all starting with '1_'! """ genome = "genome_long_header.fst" genomes = {genome: ["SAEN.1015.0117"]} -- GitLab