Skip to content
Snippets Groups Projects
Commit c5ac2d6b authored by Amandine  PERRIN's avatar Amandine PERRIN
Browse files

Adapt test to new 'splitcontig format'

Before, when we split contigs, we added '_num' at the end of the header. Now, it is at the beginning.
parent fb1b7b69
No related branches found
No related tags found
No related merge requests found
......@@ -310,16 +310,7 @@ def format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, num,
if cut:
# Cut sequence and write header + sequence to res file
num = split_contig(pat, cur_seq, cur_contig_name, contig_sizes, gresf, num)
# PROKKA User does not want to cut, but will annotate with prokka, so we still
# have to create a new sequence file
elif gresf:
new_contig_name = "{}_{}\n".format(cur_contig_name, num)
gresf.write(new_contig_name)
gresf.write(cur_seq + "\n")
contig_sizes[new_contig_name] = len(cur_seq)
num += 1
# PRODIGAL No cut, and prodigal used -> no new file created, but check
# contig unique names
# No cut -> no new file created, but check contig unique names
else:
if cur_contig_name in contig_sizes.keys():
logger.error("{} contig name is used for several contigs. Please put "
......
>contig1 dgfdgd_1
>1_contig1 dgfdgd
ACGTTGCTGC
>contig1 dgfdgd_2
>2_contig1 dgfdgd
AGCTGTCTAG
>contig2_3
>3_contig2
CGACGNNCGAG
>contig2_4
>4_contig2
AGGTG
>contig3_5
>5_contig3
ANNNNGGCTTGAGGTTGAA
>Long_header_with_same_1_1
>1_Long_header_with_same_1
AATTGCGCTAGCGCTAGGCGCTAGCGCGCTAGAGCCGCTAGGCGCCATTACGGCGCTATCCGCACGCGCATGCCACCGTTAG
>Long_header_with_same_2_2
>2_Long_header_with_same_2
AACCGTTGGGGGGGGGGCCCCATTAGGCGCGGAATTTTCG
>Long_header_with_same_3_3
>3_Long_header_with_same_3
ACGGCTCGCGGAGAGAGAGAGCTCGCGCA
......@@ -40,14 +40,16 @@ def setup_teardown_module():
- remove directory with generated results
"""
# utils.init_logger(LOGFILE_BASE, 0, 'test_postalign', verbose=1)
os.mkdir(GENEPATH)
if os.path.isdir(GENEPATH):
content = os.listdir(GENEPATH)
for f in content:
assert f.startswith(".fuse")
else:
os.mkdir(GENEPATH)
print("setup")
yield
shutil.rmtree(GENEPATH)
# for f in LOGFILES:
# if os.path.exists(f):
# os.remove(f)
shutil.rmtree(GENEPATH, ignore_errors=True)
print("teardown")
......@@ -195,7 +197,7 @@ def test_format_contig_cut():
cut = True
pat = 'NNNNN+'
cur_seq = "AACTGCTTTTTAAGCGCGCTCCTGCGNNNNNGGTTGTGTGGGCCCAGAGCGAGNCG"
cur_contig_name = ">my_contig_name_for_my_sequence"
cur_contig_name = ">my_contig_name for_my_sequence"
contig_sizes = {}
resfile = os.path.join(GENEPATH, "test_format_cont_cut5N.fna")
gresf = open(resfile, "w")
......@@ -208,8 +210,8 @@ def test_format_contig_cut():
exp_file = os.path.join(EXP_DIR, "exp_split_contig_cut3N.fna")
assert os.path.exists(resfile)
assert tutil.compare_order_content(resfile, exp_file)
assert contig_sizes == {">my_contig_name_for_my_sequence_2\n": 26,
">my_contig_name_for_my_sequence_3\n": 25}
assert contig_sizes == {">2_my_contig_name for_my_sequence\n": 26,
">3_my_contig_name for_my_sequence\n": 25}
def test_format_contig_nocut():
......@@ -220,23 +222,21 @@ def test_format_contig_nocut():
cut = False
pat = None
cur_seq = "AACTGCTTTTTAAGCGCGCTCCTGCGNNNNNGGTTGTGTGGGCCCAGAGCGAGNCG"
cur_contig_name = ">my_contig_name_for_my_sequence"
cur_contig_name = ">my_contig_name_for_my_sequence\n"
contig_sizes = {}
resfile = os.path.join(GENEPATH, "test_format_cont_nocut_prokka.fna")
gresf = open(resfile, "w")
gresf = None
num = 2
assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf,
num, logger=None) == 3
gresf.close()
num, logger=None) == 2
exp_file = os.path.join(EXP_DIR, "exp_split_contig_nocut.fna")
assert os.path.exists(resfile)
assert tutil.compare_order_content(resfile, exp_file)
assert contig_sizes == {">my_contig_name_for_my_sequence_2\n": 56}
assert not os.path.exists(resfile)
assert contig_sizes == {">my_contig_name_for_my_sequence\n": 56}
def test_format_contig_nocut_prodigal_notSameName():
def test_format_contig_nocut_notDuplicateName():
"""
For a given contig, if we want to annotate it with prodigal, and do not cut,
then we keep the same file (no need to split at 20 characters)
......@@ -262,7 +262,7 @@ def test_format_contig_nocut_prodigal_notSameName():
">mycontig": 155}
def test_format_contig_nocut_prodigal_SameName(caplog):
def test_format_contig_nocut_DuplicateName(caplog):
"""
For a given contig, if we want to annotate it with prodigal, and do not cut, then we keep the same file. However, we must check that contig names are all different.
Try to add a contig which name is already used, check that it prints the expected error,
......@@ -460,7 +460,7 @@ def test_analyse1genome_cut_same_names():
"""
Analyse a genome. Its contig names all have the same first 20 characters. There is no
stretch of at least 5N, so contigs are not split.
New contig names should be uniq, and not all ending with _0!
New contig names should be uniq, and not all starting with '1_'!
"""
genome = "genome_long_header.fst"
genomes = {genome: ["SAEN.1015.0117"]}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment