Skip to content
Snippets Groups Projects
Commit f0c6c841 authored by Amandine  PERRIN's avatar Amandine PERRIN
Browse files

Check uniqueness of contig names in prepare module

parent 83d961bc
No related branches found
No related tags found
No related merge requests found
Pipeline #40910 passed
......@@ -188,7 +188,7 @@ def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger):
if line.startswith(">"):
# If not first contig, write info to output file (if needed)
if cur_seq != "":
num = format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes,
num = format_contig(cut, pat, cur_seq, cur_contig_name, genome, contig_sizes,
gresf, num, logger)
# If problem while formatting contig, return False -> genome ignored
if num == -1:
......@@ -205,7 +205,7 @@ def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger):
# LAST CONTIG
if cur_contig_name != "":
num = format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf,
num = format_contig(cut, pat, cur_seq, cur_contig_name, genome, contig_sizes, gresf,
num, logger)
if num == -1:
return False
......@@ -274,7 +274,7 @@ def get_output_dir(soft, dbpath, tmp_path, genome, cut, pat):
return gpath, grespath
def format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, num, logger):
def format_contig(cut, pat, cur_seq, cur_contig_name, genome, contig_sizes, gresf, num, logger):
"""
Format given contig, and save to output file if needed
......@@ -291,6 +291,8 @@ def format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, num,
current sequence (aa)
cur_contig_name : str
name of current contig
genome : str
name of current genome
cont_sizes : dict
{contig_name : sequence length}
gresf : io.TextIOWrappe
......@@ -313,9 +315,9 @@ def format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, num,
# No cut -> no new file created, but check contig unique names
else:
if cur_contig_name in contig_sizes.keys():
logger.error("{} contig name is used for several contigs. Please put "
"different names for each contig. This genome will be "
"ignored.".format(cur_contig_name))
logger.error(f"In genome {genome}, '{cur_contig_name}' contig name is used for "
"several contigs. Please put different names for each contig. "
"This genome will be ignored.")
return -1
else:
contig_sizes[cur_contig_name] = len(cur_seq)
......
......@@ -203,7 +203,7 @@ def test_format_contig_cut():
gresf = open(resfile, "w")
num = 2
assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf,
assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, "genome", contig_sizes, gresf,
num, logger=None) == 4
gresf.close()
......@@ -228,7 +228,7 @@ def test_format_contig_nocut():
gresf = None
num = 2
assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf,
assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, "genome", contig_sizes, gresf,
num, logger=None) == 2
exp_file = os.path.join(EXP_DIR, "exp_split_contig_nocut.fna")
......@@ -252,9 +252,9 @@ def test_format_contig_nocut_notDuplicateName():
contig_sizes = {">mycontig": 155}
num = 1
assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, None,
assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, "genome", contig_sizes, None,
num, logger=None) == 1
assert gfunc.format_contig(cut, pat, cur_seq2, cur_contig_name2, contig_sizes, None,
assert gfunc.format_contig(cut, pat, cur_seq2, cur_contig_name2, "genome", contig_sizes, None,
num, logger=None) == 1
assert contig_sizes == {">my_contig_name_for_my_sequence": 56,
......@@ -278,18 +278,19 @@ def test_format_contig_nocut_DuplicateName(caplog):
num = 1
# Try to add a contig already existing -> error log
assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, None,
assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, "genome", contig_sizes, None,
num, logger) == -1
assert contig_sizes == {">my_contig_name_for_my_sequence": 45,
">mycontig": 155}
# Check logs
caplog.set_level(logging.DEBUG)
assert (">my_contig_name_for_my_sequence contig name is used for several contigs. "
assert ("In genome genome, '>my_contig_name_for_my_sequence' contig name is used for "
"several contigs. "
"Please put different names for each contig. This genome will be "
"ignored.") in caplog.text
# Add a contig with new name. contig_sizes is completed
assert gfunc.format_contig(cut, pat, cur_seq2, cur_contig_name2, contig_sizes, None,
assert gfunc.format_contig(cut, pat, cur_seq2, cur_contig_name2, "genome", contig_sizes, None,
num, logger) == 1
assert contig_sizes == {">my_contig_name_for_my_sequence": 45,
">my_contig2": 17,
......@@ -487,7 +488,8 @@ def test_analyse1genome_same_names_nocut(caplog):
pat = None
assert not gfunc.analyse_genome(genome, GEN_PATH, GENEPATH, cut, pat, genomes,
"prodigal", logger)
assert ("myheader contig name is used for several contigs. Please put different names for "
assert ("In genome genome_2_identical_headers.fst, '>myheader' contig name is used for "
"several contigs. Please put different names for "
"each contig. This genome will be ignored") in caplog.text
......@@ -503,7 +505,8 @@ def test_analyse1genome_same_last_name_nocut(caplog):
pat = None
assert not gfunc.analyse_genome(genome, GEN_PATH, GENEPATH, cut, pat, genomes,
"prodigal", logger)
assert ("myheader contig name is used for several contigs. Please put different names for "
assert ("In genome genome_2_identical_headers-lastContig.fst, '>myheader' contig name "
"is used for several contigs. Please put different names for "
"each contig. This genome will be ignored") in caplog.text
......
......@@ -40,7 +40,7 @@ EXP_GENOMES = {
1587120, 1, 1]
}
LOGFILE_BASE = "logfile_test.txt"
LOGFILE_BASE = os.path.join(GENEPATH, "logfile_test.txt")
LEVEL = logging.DEBUG
LOGFILES = [LOGFILE_BASE + ext for ext in [".log", ".log.debug", ".log.details", ".log.err"]]
......@@ -58,7 +58,7 @@ def setup_teardown_module():
- remove all log files
- remove directory with generated results
"""
utils.init_logger(LOGFILE_BASE, LEVEL, 'test_filter', verbose=1)
# utils.init_logger(LOGFILE_BASE, LEVEL, 'test_filter', verbose=1)
os.mkdir(GENEPATH)
print("setup")
......@@ -283,6 +283,7 @@ def test_sketch_all(caplog):
"""
Test that all genomes are sketch, in the provided order
"""
utils.init_logger(LOGFILE_BASE, LEVEL, 'test_filter', verbose=1)
caplog.set_level(logging.DEBUG)
# We give 5 genomes
genomes = {"genome1": ["g1_name", "g1_ori", os.path.join(GENOMES_DIR, "ACOR001.0519.fna"),
......@@ -721,6 +722,65 @@ def test_check_quality_no_tmpdir(caplog):
assert "tmp_dir_check_quality does not exist" in caplog.text
def test_check_quality_same_contname(caplog):
"""
quality control of all genomes in the database when one genome contains 2 contigs
with the same name: this genome is discarded + warning message
"""
caplog.set_level(logging.DEBUG)
species_linked = "my-test-genomes"
db_path = os.path.join(DATA_TEST_DIR, "genomes", "genomes_comparison")
tmp_dir = os.path.join(GENEPATH, "tmp_dir_check_quality")
os.makedirs(tmp_dir)
# Copy all genomes to new database
db_path_used = os.path.join(GENEPATH, "database")
shutil.copytree(db_path, db_path_used)
# Add another genome, with 2 identical contig names
new_genome = os.path.join(db_path_used, "genome_duplicate.fst")
with open(new_genome, "w") as ng:
ng.write(">my_contig\nAACTATATAGGAAGACACACAATTAAGGGACAGG\n")
ng.write(">my_contig2\nCCNNCCGATTCGAGCACACAATTAAGGGACAGG\n")
ng.write(">my_contig\nCCANNCCATCTCTCTTATCTCTCTAANNNNCTCTANCCNNNNNNCCATTCA\n")
max_l90 = 100
max_cont = 100
cutn = 0
# Get genomes information
genomes = filterg.check_quality(species_linked, db_path_used, tmp_dir, max_l90, max_cont, cutn)
# Even if there was 1 more genome in the database, output list of genomes is the same,
# as the supplementary genome contains identical contig names
exp_genomes = {
"ACOR001.0519.fna": ["ACOR001.0519",
os.path.join(db_path_used, "ACOR001.0519.fna"),
os.path.join(db_path_used, "ACOR001.0519.fna"),
3013644, 269, 37],
"ACOR001.0519-bis.fna": ["ACOR001.0519-bis",
os.path.join(db_path_used, "ACOR001.0519-bis.fna"),
os.path.join(db_path_used, "ACOR001.0519-bis.fna"),
3013644, 269, 37],
"ACOR001.0519-almost-same.fna": ["ACOR001.0519-almost-same",
os.path.join(db_path_used, "ACOR001.0519-almost-same.fna"),
os.path.join(db_path_used, "ACOR001.0519-almost-same.fna"),
3012665, 261, 37],
"ACOR002.0519.fna": ["ACOR002.0519",
os.path.join(db_path_used, "ACOR002.0519.fna"),
os.path.join(db_path_used, "ACOR002.0519.fna"),
2997537, 78, 23],
"ACOC.1019.fna": ["ACOC.1019", os.path.join(db_path_used, "ACOC.1019.fna"),
os.path.join(db_path_used, "ACOC.1019.fna"),
1587120, 1, 1]
}
assert genomes == exp_genomes
# Check logs
assert ("Total number of genomes for my-test-genomes: 6") in caplog.text
assert ("In genome genome_duplicate.fst, '>my_contig' contig name is used for several "
"contigs. Please put different names for each contig. This genome will be "
"ignored.") in caplog.text
def test_check_quality_no_genome(caplog):
"""
quality control of all genomes in the database where there is no genome to annotate:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment