diff --git a/PanACoTA/annotate_module/genome_seq_functions.py b/PanACoTA/annotate_module/genome_seq_functions.py index 269f660251a727206e9baf50475fa08b86c661d6..e09697e5d1def7dd205ecb44634059a30369bdc8 100755 --- a/PanACoTA/annotate_module/genome_seq_functions.py +++ b/PanACoTA/annotate_module/genome_seq_functions.py @@ -188,7 +188,7 @@ def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger): if line.startswith(">"): # If not first contig, write info to output file (if needed) if cur_seq != "": - num = format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, + num = format_contig(cut, pat, cur_seq, cur_contig_name, genome, contig_sizes, gresf, num, logger) # If problem while formatting contig, return False -> genome ignored if num == -1: @@ -205,7 +205,7 @@ def analyse_genome(genome, dbpath, tmp_path, cut, pat, genomes, soft, logger): # LAST CONTIG if cur_contig_name != "": - num = format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, + num = format_contig(cut, pat, cur_seq, cur_contig_name, genome, contig_sizes, gresf, num, logger) if num == -1: return False @@ -274,7 +274,7 @@ def get_output_dir(soft, dbpath, tmp_path, genome, cut, pat): return gpath, grespath -def format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, num, logger): +def format_contig(cut, pat, cur_seq, cur_contig_name, genome, contig_sizes, gresf, num, logger): """ Format given contig, and save to output file if needed @@ -291,6 +291,8 @@ def format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, num, current sequence (aa) cur_contig_name : str name of current contig + genome : str + name of current genome cont_sizes : dict {contig_name : sequence length} gresf : io.TextIOWrappe @@ -313,9 +315,9 @@ def format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, num, # No cut -> no new file created, but check contig unique names else: if cur_contig_name in contig_sizes.keys(): - logger.error("{} contig name is used for several contigs. Please put " - "different names for each contig. This genome will be " - "ignored.".format(cur_contig_name)) + logger.error(f"In genome {genome}, '{cur_contig_name}' contig name is used for " + "several contigs. Please put different names for each contig. " + "This genome will be ignored.") return -1 else: contig_sizes[cur_contig_name] = len(cur_seq) diff --git a/test/test_unit/test_annotate/test_genome_func.py b/test/test_unit/test_annotate/test_genome_func.py index b17924218c33dc9c60616fc9ea55411ac11eec08..e362b732e76f9449d68baf14829e23084b2769b8 100755 --- a/test/test_unit/test_annotate/test_genome_func.py +++ b/test/test_unit/test_annotate/test_genome_func.py @@ -203,7 +203,7 @@ def test_format_contig_cut(): gresf = open(resfile, "w") num = 2 - assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, + assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, "genome", contig_sizes, gresf, num, logger=None) == 4 gresf.close() @@ -228,7 +228,7 @@ def test_format_contig_nocut(): gresf = None num = 2 - assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, gresf, + assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, "genome", contig_sizes, gresf, num, logger=None) == 2 exp_file = os.path.join(EXP_DIR, "exp_split_contig_nocut.fna") @@ -252,9 +252,9 @@ def test_format_contig_nocut_notDuplicateName(): contig_sizes = {">mycontig": 155} num = 1 - assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, None, + assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, "genome", contig_sizes, None, num, logger=None) == 1 - assert gfunc.format_contig(cut, pat, cur_seq2, cur_contig_name2, contig_sizes, None, + assert gfunc.format_contig(cut, pat, cur_seq2, cur_contig_name2, "genome", contig_sizes, None, num, logger=None) == 1 assert contig_sizes == {">my_contig_name_for_my_sequence": 56, @@ -278,18 +278,19 @@ def test_format_contig_nocut_DuplicateName(caplog): num = 1 # Try to add a contig already existing -> error log - assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, contig_sizes, None, + assert gfunc.format_contig(cut, pat, cur_seq, cur_contig_name, "genome", contig_sizes, None, num, logger) == -1 assert contig_sizes == {">my_contig_name_for_my_sequence": 45, ">mycontig": 155} # Check logs caplog.set_level(logging.DEBUG) - assert (">my_contig_name_for_my_sequence contig name is used for several contigs. " + assert ("In genome genome, '>my_contig_name_for_my_sequence' contig name is used for " + "several contigs. " "Please put different names for each contig. This genome will be " "ignored.") in caplog.text # Add a contig with new name. contig_sizes is completed - assert gfunc.format_contig(cut, pat, cur_seq2, cur_contig_name2, contig_sizes, None, + assert gfunc.format_contig(cut, pat, cur_seq2, cur_contig_name2, "genome", contig_sizes, None, num, logger) == 1 assert contig_sizes == {">my_contig_name_for_my_sequence": 45, ">my_contig2": 17, @@ -487,7 +488,8 @@ def test_analyse1genome_same_names_nocut(caplog): pat = None assert not gfunc.analyse_genome(genome, GEN_PATH, GENEPATH, cut, pat, genomes, "prodigal", logger) - assert ("myheader contig name is used for several contigs. Please put different names for " + assert ("In genome genome_2_identical_headers.fst, '>myheader' contig name is used for " + "several contigs. Please put different names for " "each contig. This genome will be ignored") in caplog.text @@ -503,7 +505,8 @@ def test_analyse1genome_same_last_name_nocut(caplog): pat = None assert not gfunc.analyse_genome(genome, GEN_PATH, GENEPATH, cut, pat, genomes, "prodigal", logger) - assert ("myheader contig name is used for several contigs. Please put different names for " + assert ("In genome genome_2_identical_headers-lastContig.fst, '>myheader' contig name " + "is used for several contigs. Please put different names for " "each contig. This genome will be ignored") in caplog.text diff --git a/test/test_unit/test_prepare/test_filter.py b/test/test_unit/test_prepare/test_filter.py index 86d907fb252f355b1b9f3f172e2f220b87e194ed..d71669e0c54c45768f81193e2e0dbe8e9712d3dc 100755 --- a/test/test_unit/test_prepare/test_filter.py +++ b/test/test_unit/test_prepare/test_filter.py @@ -40,7 +40,7 @@ EXP_GENOMES = { 1587120, 1, 1] } -LOGFILE_BASE = "logfile_test.txt" +LOGFILE_BASE = os.path.join(GENEPATH, "logfile_test.txt") LEVEL = logging.DEBUG LOGFILES = [LOGFILE_BASE + ext for ext in [".log", ".log.debug", ".log.details", ".log.err"]] @@ -58,7 +58,7 @@ def setup_teardown_module(): - remove all log files - remove directory with generated results """ - utils.init_logger(LOGFILE_BASE, LEVEL, 'test_filter', verbose=1) + # utils.init_logger(LOGFILE_BASE, LEVEL, 'test_filter', verbose=1) os.mkdir(GENEPATH) print("setup") @@ -283,6 +283,7 @@ def test_sketch_all(caplog): """ Test that all genomes are sketch, in the provided order """ + utils.init_logger(LOGFILE_BASE, LEVEL, 'test_filter', verbose=1) caplog.set_level(logging.DEBUG) # We give 5 genomes genomes = {"genome1": ["g1_name", "g1_ori", os.path.join(GENOMES_DIR, "ACOR001.0519.fna"), @@ -721,6 +722,65 @@ def test_check_quality_no_tmpdir(caplog): assert "tmp_dir_check_quality does not exist" in caplog.text +def test_check_quality_same_contname(caplog): + """ + quality control of all genomes in the database when one genome contains 2 contigs + with the same name: this genome is discarded + warning message + """ + caplog.set_level(logging.DEBUG) + species_linked = "my-test-genomes" + db_path = os.path.join(DATA_TEST_DIR, "genomes", "genomes_comparison") + tmp_dir = os.path.join(GENEPATH, "tmp_dir_check_quality") + os.makedirs(tmp_dir) + # Copy all genomes to new database + db_path_used = os.path.join(GENEPATH, "database") + shutil.copytree(db_path, db_path_used) + # Add another genome, with 2 identical contig names + new_genome = os.path.join(db_path_used, "genome_duplicate.fst") + with open(new_genome, "w") as ng: + ng.write(">my_contig\nAACTATATAGGAAGACACACAATTAAGGGACAGG\n") + ng.write(">my_contig2\nCCNNCCGATTCGAGCACACAATTAAGGGACAGG\n") + ng.write(">my_contig\nCCANNCCATCTCTCTTATCTCTCTAANNNNCTCTANCCNNNNNNCCATTCA\n") + max_l90 = 100 + max_cont = 100 + cutn = 0 + + # Get genomes information + genomes = filterg.check_quality(species_linked, db_path_used, tmp_dir, max_l90, max_cont, cutn) + + # Even if there was 1 more genome in the database, output list of genomes is the same, + # as the supplementary genome contains identical contig names + exp_genomes = { + "ACOR001.0519.fna": ["ACOR001.0519", + os.path.join(db_path_used, "ACOR001.0519.fna"), + os.path.join(db_path_used, "ACOR001.0519.fna"), + 3013644, 269, 37], + "ACOR001.0519-bis.fna": ["ACOR001.0519-bis", + os.path.join(db_path_used, "ACOR001.0519-bis.fna"), + os.path.join(db_path_used, "ACOR001.0519-bis.fna"), + 3013644, 269, 37], + "ACOR001.0519-almost-same.fna": ["ACOR001.0519-almost-same", + os.path.join(db_path_used, "ACOR001.0519-almost-same.fna"), + os.path.join(db_path_used, "ACOR001.0519-almost-same.fna"), + 3012665, 261, 37], + "ACOR002.0519.fna": ["ACOR002.0519", + os.path.join(db_path_used, "ACOR002.0519.fna"), + os.path.join(db_path_used, "ACOR002.0519.fna"), + 2997537, 78, 23], + "ACOC.1019.fna": ["ACOC.1019", os.path.join(db_path_used, "ACOC.1019.fna"), + os.path.join(db_path_used, "ACOC.1019.fna"), + 1587120, 1, 1] + } + + assert genomes == exp_genomes + + # Check logs + assert ("Total number of genomes for my-test-genomes: 6") in caplog.text + assert ("In genome genome_duplicate.fst, '>my_contig' contig name is used for several " + "contigs. Please put different names for each contig. This genome will be " + "ignored.") in caplog.text + + def test_check_quality_no_genome(caplog): """ quality control of all genomes in the database where there is no genome to annotate: