diff --git a/PanACoTA/annotate_module/general_format_functions.py b/PanACoTA/annotate_module/general_format_functions.py index 1ec1caf1a987098585a3e43da9b9bb37d442be9c..fec882417f8f85d365118c6bc96a1857a00c516b 100644 --- a/PanACoTA/annotate_module/general_format_functions.py +++ b/PanACoTA/annotate_module/general_format_functions.py @@ -165,7 +165,6 @@ def handle_genome(args): format_one_genome = fprodigal.format_one_genome else: format_one_genome = fprokka.format_one_genome - # Set logger for this process qh = logging.handlers.QueueHandler(q) root = logging.getLogger() @@ -234,7 +233,7 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc, gene_name = "crispr" product = "crispr-array" crispr_num += 1 - locus_name = "{}.{}{}_{}".format(genome, str(cont_num).zfill(4), cont_loc, + locus_name = "{}.{}{}_{}".format(genome, cont_loc, str(cont_num).zfill(4), str(locus_num).zfill(5)) # If '|' character found in those fields, replace by '_' to avoid problems while parsing more_info = "| {} | {} | {} | {}".format(product.replace("|", "_"), diff --git a/test/data/annotate/exp_files/res_test_write_geneCDS.lst b/test/data/annotate/exp_files/res_test_write_geneCDS.lst index fd07c103c6c41d1bddbe63a33596ceda9989b418..362776350d69975ba3fb5b548d8ec564acda0741 100755 --- a/test/data/annotate/exp_files/res_test_write_geneCDS.lst +++ b/test/data/annotate/exp_files/res_test_write_geneCDS.lst @@ -1 +1 @@ -154 656 C CDS ESCO.0216.00005.i0015_5621221 abc | new product | 454.12.5 | more information... dfd _ with _ pipe_characters... +154 656 C CDS ESCO.0216.00005.i0015_5621221 abc | new product | 454.12.5 | more information... dfd _ with _ pipe_characters... | mydb_pipe diff --git a/test/data/annotate/exp_files/res_test_write_geneCRISPR.lst b/test/data/annotate/exp_files/res_test_write_geneCRISPR.lst index 8fc5046e9f80b838c3c44dde7f62242ddaa311e7..47336e114ee3c8a921d29a7a17317e131c04a185 100755 --- a/test/data/annotate/exp_files/res_test_write_geneCRISPR.lst +++ b/test/data/annotate/exp_files/res_test_write_geneCRISPR.lst @@ -1 +1 @@ -154 656 D CRISPR ESCO.0216.00005.b0015_CRISPR1 crispr | crispr-array | NA | more information... dfd _ with _ pipe_characters... +154 656 D CRISPR ESCO.0216.00005.b0015_CRISPR1 crispr | crispr-array | NA | more information... dfd _ with _ pipe_characters... | mydb_pipe diff --git a/test/test_unit/test_annotate/test_format_func.py b/test/test_unit/test_annotate/test_format_func.py index b4ecbaf30cb88ab576650dd0e0ed5777fcec8a3f..c4e1bafc45828bd32d28daf69d35f3d6e5d39434 100755 --- a/test/test_unit/test_annotate/test_format_func.py +++ b/test/test_unit/test_annotate/test_format_func.py @@ -2,14 +2,14 @@ # coding: utf-8 """ -Unit tests for format_functions.py +Unit tests for general_format_functions.py """ import os import logging import shutil from io import StringIO -import PanACoTA.annotate_module.format_functions as ffunc +import PanACoTA.annotate_module.general_format_functions as ffunc import PanACoTA.utils as utils @@ -22,7 +22,7 @@ def my_logger(): m = multiprocessing.Manager() q = m.Queue() qh = logging.handlers.QueueHandler(q) - root = logging.getLogger() + root = logging.getLogger('test_gene-format') root.setLevel(logging.DEBUG) root.handlers = [] logging.addLevelName(utils.detail_lvl(), "DETAIL") @@ -46,20 +46,24 @@ def test_write_gene(): cont_num = 15 ecnum = "454.12.5" inf2 = "more information... dfd | with | pipe|characters..." + db_xref = "mydb|pipe" strand = "C" - start = 154 - end = 656 + start = str(154) + end = str(656) lstfile = "toto.lst" lstopenfile = open(lstfile, "w") - res = ffunc.write_gene(gtype, locus_num, gene_name, product, crispr_num, - cont_loc, genome, cont_num, ecnum, inf2, strand, - start, end, lstopenfile) + crispres, lst_line = ffunc.write_gene(gtype, locus_num, gene_name, product, crispr_num, + cont_loc, genome, cont_num, ecnum, inf2, db_xref, strand, + start, end, lstopenfile) lstopenfile.close() exp_file = os.path.join("test", "data", "annotate", "exp_files", "res_test_write_geneCDS.lst") - assert res == crispr_num + assert crispres == crispr_num with open(exp_file, "r") as expf, open(lstfile, "r") as lstf: for line_exp, line_out in zip(expf, lstf): assert line_exp == line_out + assert lst_line == ("154\t656\tC\tCDS\tESCO.0216.00005.i0015_5621221\tabc\t| new product " + "| 454.12.5 | more information... dfd _ with _ pipe_characters... | " + "mydb_pipe") os.remove(lstfile) @@ -78,1107 +82,1085 @@ def test_write_crispr(): cont_num = 15 ecnum = "NA" inf2 = "more information... dfd | with | pipe|characters..." + db_xref = "mydb|pipe" strand = "D" - start = 154 - end = 656 + start = str(154) + end = str(656) lstfile = "toto.lst" lstopenfile = open(lstfile, "w") - res = ffunc.write_gene(gtype, locus_num, gene_name, product, crispr_num, - cont_loc, genome, cont_num, ecnum, inf2, strand, - start, end, lstopenfile) + crispres, lstline = ffunc.write_gene(gtype, locus_num, gene_name, product, crispr_num, + cont_loc, genome, cont_num, ecnum, inf2, db_xref, strand, + start, end, lstopenfile) lstopenfile.close() exp_file = os.path.join("test", "data", "annotate", "exp_files", "res_test_write_geneCRISPR.lst") - assert res == 2 + assert crispres == 2 with open(exp_file, "r") as expf, open(lstfile, "r") as lstf: for line_exp, line_out in zip(expf, lstf): + print(f"\n{line_out}\n{line_exp}") assert line_exp == line_out + assert lstline == ("154\t656\tD\tCRISPR\tESCO.0216.00005.b0015_CRISPR1\tcrispr\t| " + "crispr-array | NA | more information... dfd _ with _ pipe_characters... | " + "mydb_pipe") os.remove(lstfile) -def test_tbl_to_lst(): - """ - Check that generated lstinfo file is as expected, when the genome name is the same as - it already was in the genome given to prokka. - The test tblfile contains the following aspects: - - gene in D strand (start < end) - - gene in C strand (start > end) - - CDS features (some with all info = ECnumber, gene name, product etc. ; - some with missing info) - - tRNA type - - repeat_region type (*2) - - contigs with more than 2 genes - - contig with only 2 genes (both 'b' loc) - - contig with 1 gene ('b' loc) - - contig without gene (should be skipped) - """ - tblfile = os.path.join("test", "data", "annotate", "test_files", "original_name.fna-prokkaRes", - "prokka_out_for_test.tbl") - lstfile = os.path.join("test", "data", "annotate", "test_tbl2lst.lst") - exp_lst = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") - name = "test.0417.00002" - assert ffunc.tbl2lst(tblfile, lstfile, name) - with open(exp_lst, "r") as expf, open(lstfile, "r") as lstf: - for line_exp, line_out in zip(expf, lstf): - assert line_exp == line_out - os.remove(lstfile) - - -def test_tbl_to_lst_new_name(): - """ - Check that generated lstinfo file is as expected, when the genome name has changed between - the one given to prokka, and the name given now. - The test tblfile contains the following aspects: - - gene in D strand (start < end) - - gene in C strand (start > end) - - CDS features (some with all info = ECnumber, gene name, product etc. ; - some with missing info) - - tRNA type - - repeat_region type (*2) - - contigs with more than 2 genes - - contig with only 2 genes (both 'b' loc) - - contig with 1 gene ('b' loc) - - contig without gene (should be skipped) - """ - tblfile = os.path.join("test", "data", "annotate", "test_files", "original_name.fna-prokkaRes", - "prokka_out_for_test.tbl") - lstfile = os.path.join("test", "data", "annotate", "test_tbl2lstNewName.lst") - exp_lst = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst-newName.lst") - name = "test.0417.00010" - assert not ffunc.tbl2lst(tblfile, lstfile, name) - with open(exp_lst, "r") as expf, open(lstfile, "r") as lstf: - for line_exp, line_out in zip(expf, lstf): - assert line_exp == line_out - os.remove(lstfile) - - -def test_write_header_gene(): - """ - From a given line of lstinfo file, giving info for a gene (start, end, gene name, - product, EC number, more information), check that the header line of the protein and - gene files are generated as expected. - """ - outfile = StringIO() - lstline = ("4416\t6068\tD\tCDS\ttest.0417.00002.i0001_00005\tyiaD\t| " - "putative lipoprotein YiaD | 6.3.2.- | similar to AA sequence:UniProtKB:P37665") - ffunc.write_header(lstline, outfile) - res = outfile.getvalue() - exp = (">test.0417.00002.i0001_00005 1653 yiaD | putative lipoprotein YiaD | 6.3.2.- " - "| similar to AA sequence:UniProtKB:P37665\n") - assert res == exp - outfile.close() - - -def test_write_header_gene_no_name(): - """ - From a given line of lstinfo file, giving info for a gene with many unknown parts (gene - name, product, EC number and more information are NAs), check that the header line of the - protein and gene files are generated as expected. - """ - outfile = StringIO() - lstline = ("4632\t5000\tC\tCDS\ttest.0417.00002.b0002_00011\tNA\t| hypothetical protein " - "| NA | NA") - ffunc.write_header(lstline, outfile) - res = outfile.getvalue() - exp = ">test.0417.00002.b0002_00011 369 NA | hypothetical protein | NA | NA\n" - assert res == exp - outfile.close() - - -def test_write_header_crispr(): - """ - From a given line of lstinfo file, giving info for a CRISPR check that the header - line of the protein and gene files are generated as expected. - """ - outfile = StringIO() - lstline = ("296902\t2968265\tC\tCRISPR\ttest.0417.00002.b0003_CRISPR1\tcrispr\t| " - "crispr-array | NA | NA") - ffunc.write_header(lstline, outfile) - res = outfile.getvalue() - exp = ">test.0417.00002.b0003_CRISPR1 2671364 crispr | crispr-array | NA | NA\n" - assert res == exp - outfile.close() - - -def test_create_prt_wrong_header_sep(): - """ - Test that, when creating prt file from faa and lst, if a header of faa file is - not in the right format (protein name and number are not separated by '_'), - it writes an error, erases prt file, and returns False. - """ - logger = my_logger() - faaseq = os.path.join("test", "data", "annotate", "test_files", - "prokka_out_for_test-wrongHeaderSep.faa") - lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") - prtseq = os.path.join("test", "data", "annotate", "test_create_prt-wrongHeadSep.prt") - assert not ffunc.create_prt(faaseq, lstfile, prtseq, logger[1]) - assert not os.path.isfile(prtseq) - msg = ("Unknown header format >JGIKIPIJ00008 in test/data/annotate/test_files/" - "prokka_out_for_test-wrongHeaderSep.faa. Error: invalid literal for int() " - "with base 10: '>JGIKIPIJ00008'\nPrt file not created from " - "test/data/annotate/test_files/prokka_out_for_test-wrongHeaderSep.faa.") - q = logger[0] - assert q.qsize() == 1 - assert q.get().message == msg - - -def test_create_prt_wrong_header_int(): - """ - Test that, when creating prt file from faa and lst, if a header of faa file is - not in the right format (protein name and number are separated by '_', but protein num - contains a letter), it writes an error, erases prt file, and returns False. - """ - logger = my_logger() - faaseq = os.path.join("test", "data", "annotate", "test_files", - "prokka_out_for_test-wrongHeaderInt.faa") - lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") - prtseq = os.path.join("test", "data", "annotate", "test_create_prt-wrongHeadInt.prt") - assert not ffunc.create_prt(faaseq, lstfile, prtseq, logger[1]) - assert not os.path.isfile(prtseq) - msg = ("Unknown header format >JGIKIPIJ_d0008 in test/data/annotate/test_files/" - "prokka_out_for_test-wrongHeaderInt.faa. Error: invalid literal for int() " - "with base 10: 'd0008'\nPrt file not created from " - "test/data/annotate/test_files/prokka_out_for_test-wrongHeaderInt.faa.") - q = logger[0] - assert q.qsize() == 1 - assert q.get().message == msg - - -def test_create_prt_miss_lst(): - """ - Test that, when creating prt file from faa and lst, if a protein of faa file is not present in - the lst file, it writes an error, removes the prt file, and returns False. - """ - logger = my_logger() - faaseq = os.path.join("test", "data", "annotate", "test_files", - "prokka_out_for_test-supHeader.faa") - lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") - prtseq = os.path.join("test", "data", "annotate", "test_create_prt-missLst.prt") - assert not ffunc.create_prt(faaseq, lstfile, prtseq, logger[1]) - assert not os.path.isfile(prtseq) - msg = ("Missing info for protein >sup-prot_00012 in " - "test/data/annotate/exp_files/res_tbl2lst.lst. If it is " - "actually present in the lst file, check that proteins are ordered by " - "increasing number in both lst and faa files.\n" - "Prt file not created from test/data/annotate/test_files/" - "prokka_out_for_test-supHeader.faa.") - q = logger[0] - assert q.qsize() == 1 - assert q.get().message == msg - - -def test_create_prt_wrong_order(): - """ - Test that, when creating prt file from faa and lst, if a protein of faa file is not in - increasing protein number, so that it does not correspond to the protein in the lstinfo file, - it writes an error, removes the prt file, and returns False. - """ - logger = my_logger() - faaseq = os.path.join("test", "data", "annotate", "test_files", - "prokka_out_for_test-wrongOrder.faa") - lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") - prtseq = os.path.join("test", "data", "annotate", "test_create_prt-wrongOrder.prt") - assert not ffunc.create_prt(faaseq, lstfile, prtseq, logger[1]) - assert not os.path.isfile(prtseq) - msg = ("Missing info for protein >appears_after_13_00011 in " - "test/data/annotate/exp_files/res_tbl2lst.lst. If it is " - "actually present in the lst file, check that proteins are ordered by " - "increasing number in both lst and faa files.\n" - "Prt file not created from test/data/annotate/test_files/" - "prokka_out_for_test-wrongOrder.faa.") - q = logger[0] - assert q.qsize() == 1 - assert q.get().message == msg - - -def test_create_prt_ok(): - """ - Test that when everything is ok in both faa and lst files, the prt file is - created as expected. - """ - logger = my_logger() - faaseq = os.path.join("test", "data", "annotate", "test_files", "original_name.fna-prokkaRes", - "prokka_out_for_test.faa") - lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") - prtseq = os.path.join("test", "data", "annotate", "test_create_prt.prt") - assert ffunc.create_prt(faaseq, lstfile, prtseq, logger[1]) - exp_file = os.path.join("test", "data", "annotate", "exp_files", "res_create_prt.faa") - with open(exp_file, "r") as expf, open(prtseq, "r") as prtf: - for line_exp, line_out in zip(expf, prtf): - assert line_exp == line_out - os.remove(prtseq) - - -def test_create_gen_sup_crispr(): - """ - Test that when there is a CRISPR in the ffn file, but not in lstinfo, - it generates an error, because the CRISPR ID does not correspond to the gene ID in lstinfo. - It should return False, write an error message, and remove the .gen file. - Moreover, the CRISPR ID is not in the same format as a gene ID, so the error should - be on the format. - """ - logger = my_logger() - ffnseq = os.path.join("test", "data", "annotate", "test_files", - "prokka_out_for_test-supCRISPR.ffn") - lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") - genseq = os.path.join("test", "data", "annotate", "test_create_gen_supCRISPR.gen") - assert not ffunc.create_gen(ffnseq, lstfile, genseq, logger[1]) - assert not os.path.isfile(genseq) - msg = ("Unknown header format >prokka_out_for_test in test/data/annotate/test_files/" - "prokka_out_for_test-supCRISPR.ffn.\nGen file will not be created.") - q = logger[0] - assert q.qsize() == 1 - assert q.get().message == msg - - -def test_create_gen_sup_gene(): - """ - Test that, when creating gen file from ffn and lst, if a gene of ffn file is not present in - the lst file, it writes an error, removes the gen file, and returns False. - """ - logger = my_logger() - ffnseq = os.path.join("test", "data", "annotate", "test_files", - "prokka_out_for_test-supGene.ffn") - lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") - prtseq = os.path.join("test", "data", "annotate", "test_create_gen-supgene.prt") - assert not ffunc.create_gen(ffnseq, lstfile, prtseq, logger[1]) - assert not os.path.isfile(prtseq) - msg = ("Missing info for gene >sup_gene_00012 in test/data/annotate/exp_files/" - "res_tbl2lst.lst. If it is actually present " - "in the lst file, check that genes are ordered by increasing " - "number in both lst and ffn files.\nGen file not created" - " from test/data/annotate/test_files/prokka_out_for_test-supGene.ffn.") - q = logger[0] - assert q.qsize() == 1 - assert q.get().message == msg - - -def test_create_gen_miss_crispr(): - """ - Test for situation where there are 2 CRISPRs in the lstinfo file, the first one - is not in the ffn file, while the second one is in the ffn file. - It should return an error message about CRISPR number not corresponding, as - the first CRISPR found in ffn (CRISPR 1) corresponds to CRISPR2 in lstinfo file. - Gene file should be removed, and the function should return False - """ - logger = my_logger() - ffnseq = os.path.join("test", "data", "annotate", "test_files", - "prokka_out_for_test-missCRISPR.ffn") - lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") - genseq = os.path.join("test", "data", "annotate", "test_create_gen_missCRISPR.gen") - assert not ffunc.create_gen(ffnseq, lstfile, genseq, logger[1]) - assert not os.path.isfile(genseq) - msg = ("Problem with CRISPR numbers in test/data/annotate/exp_files/res_tbl2lst.lst. " - "CRISPR >prokka_out_for_test in ffn is CRISPR num 1, whereas it is annotated " - "as CRISPR num 2 in lst file.") - q = logger[0] - assert q.qsize() == 1 - assert q.get().message == msg - - -def test_create_gen_no_crispr_ffn(): - """ - Test that when the there is a CRISPR in the lst file, but not in the ffn file, - everything goes well, as in some versions of prokka (1.12)n CRISPRs are not in ffn while - they are specified in lst. Function should return True, and gene file created. - """ - ffnseq = os.path.join("test", "data", "annotate", "test_files", - "prokka_out_for_test-noCRISPRffn.ffn") - lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") - genseq = os.path.join("test", "data", "annotate", "test_create_gen_noCRISPRffn.gen") - assert ffunc.create_gen(ffnseq, lstfile, genseq, my_logger()[1]) - assert os.path.isfile(genseq) - exp_file = os.path.join("test", "data", "annotate", "exp_files", - "res_create_gen_noCRISPRffn.gen") - with open(exp_file, "r") as expf, open(genseq, "r") as prtf: - for line_exp, line_out in zip(expf, prtf): - assert line_exp == line_out - os.remove(genseq) - - -def test_create_gen_wrong_header_sep(): - """ - Test that, when creating gen file from ffn and lst, if a header of ffn file is - not in the right format (gene name and number are not separated by '_'), - it writes an error, erases gen file, and returns False. - """ - logger = my_logger() - ffnseq = os.path.join("test", "data", "annotate", "test_files", - "prokka_out_for_test-wrongFormat.ffn") - lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") - genseq = os.path.join("test", "data", "annotate", "test_create_gen_wrongHeadSep.gen") - assert not ffunc.create_gen(ffnseq, lstfile, genseq, logger[1]) - assert not os.path.isfile(genseq) - msg = ("Unknown header format >JGIKIPIJ-00005 in test/data/annotate/test_files/" - "prokka_out_for_test-wrongFormat.ffn.\n" - "Gen file will not be created.") - q = logger[0] - assert q.qsize() == 1 - assert q.get().message == msg - - -def test_create_gen_wrong_header_int(): - """ - Test that, when creating gen file from ffn and lst, if a header of ffn file is - not in the right format (gene name and number are separated by '_', but gene num - contains a letter), it writes an error, erases gen file, and returns False. - """ - logger = my_logger() - ffnseq = os.path.join("test", "data", "annotate", "test_files", - "prokka_out_for_test-wrongInt.ffn") - lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") - genseq = os.path.join("test", "data", "annotate", "test_create_gen_wrongHeadInt.gen") - assert not ffunc.create_gen(ffnseq, lstfile, genseq, logger[1]) - assert not os.path.isfile(genseq) - msg = ("Unknown header format >JGIKIPIJ_a00005 in test/data/annotate/test_files/" - "prokka_out_for_test-wrongInt.ffn.\n" - "Gen file will not be created.") - q = logger[0] - assert q.qsize() == 1 - assert q.get().message == msg - - -def test_create_gen_wrong_lst_int(): - """ - Test that, when creating gen file from ffn and lst, if a gene name in lst file is - not in the right format (gene name and number are separated by '_', but gene num - contains a letter), it writes an error, erases gen file, and returns False. - Because the gene name in ffn won't be found in lst (a it contains an error in lst). - """ - logger = my_logger() - ffnseq = os.path.join("test", "data", "annotate", "test_files", "original_name.fna-prokkaRes", - "prokka_out_for_test.ffn") - lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst-wrongGeneName.lst") - genseq = os.path.join("test", "data", "annotate", "test_create_gen_wrongLstHeadInt.gen") - assert not ffunc.create_gen(ffnseq, lstfile, genseq, logger[1]) - assert not os.path.isfile(genseq) - msg = ("Missing info for gene >JGIKIPIJ_00009 in test/data/annotate/exp_files/" - "res_tbl2lst-wrongGeneName.lst. If it is actually present " - "in the lst file, check that genes are ordered by increasing " - "number in both lst and ffn files.\nGen file not created " - "from test/data/annotate/test_files/original_name.fna-prokkaRes/" - "prokka_out_for_test.ffn.") - q = logger[0] - assert q.qsize() == 1 - assert q.get().message == msg - - -def test_create_gen_ok(): - """ - Test that when everything is ok in both ffn and lst files, the gen file is - created as expected. - """ - faaseq = os.path.join("test", "data", "annotate", "test_files", "original_name.fna-prokkaRes", - "prokka_out_for_test.ffn") - lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") - genseq = os.path.join("test", "data", "annotate", "test_create_gen.gen") - assert ffunc.create_gen(faaseq, lstfile, genseq, my_logger()[1]) - exp_file = os.path.join("test", "data", "annotate", "exp_files", "res_create_gen.gen") - with open(exp_file, "r") as expf, open(genseq, "r") as prtf: - for line_exp, line_out in zip(expf, prtf): - assert line_exp == line_out - os.remove(genseq) - - -def test_handle_line_gff(): - """ - Check that given a line in lstinfo with no information (gene name, ecnumber, product, - inference), it returns the good gff line - """ - line_lst = ("201\t743\tC\tCDS\tESCO.1015.00001.b0001_00001\tNA\t" - "| NA | NA | NA") - line_gff = ("H561_S27_L001__1\tProdigal:2.6\tCDS\t201\t743\t.\t-\t0\t" - "ID=ONKACNIE_00001;inference=ab initio " - "prediction:Prodigal:2.6;locus_tag=ONKACNIE_00001;product=hypothetical protein") - outgff = "test_handle_line_gff.gff" - gfff = open(outgff, "w") - ffunc.handle_line_gff(line_lst, line_gff, gfff) - gfff.close() - with open(outgff, "r") as gf: - lines = gf.readlines() - assert len(lines) == 1 - exp_line = ("ESCO.1015.00001.0001\tProdigal:2.6\tCDS\t201\t743\t.\t-\t.\t" - "ID=ESCO.1015.00001.b0001_00001;locus_tag=ESCO.1015.00001.b0001_00001\n") - assert lines[0] == exp_line - os.remove(outgff) - - -def test_handle_line_gff_ecnum(): - """ - Check that given a line in lstinfo with only ec_number information, - it returns the good gff line - """ - line_lst = ("201\t743\tC\tCDS\tESCO.1015.00001.b0001_00001\tNA\t" - "| NA | 123.995.4546 | NA") - line_gff = ("H561_S27_L001__1\tProdigal:2.6\tCDS\t201\t743\t.\t-\t0\t" - "ID=ONKACNIE_00001;inference=ab initio " - "prediction:Prodigal:2.6;locus_tag=ONKACNIE_00001;product=hypothetical protein") - outgff = "test_handle_line_gff-ecnum.gff" - gfff = open(outgff, "w") - ffunc.handle_line_gff(line_lst, line_gff, gfff) - gfff.close() - with open(outgff, "r") as gf: - lines = gf.readlines() - assert len(lines) == 1 - exp_line = ("ESCO.1015.00001.0001\tProdigal:2.6\tCDS\t201\t743\t.\t-\t.\t" - "ID=ESCO.1015.00001.b0001_00001;eC_number=123.995.4546;" - "locus_tag=ESCO.1015.00001.b0001_00001\n") - assert lines[0] == exp_line - os.remove(outgff) - - -def test_handle_line_gff_gene(): - """ - Check that given a line in lstinfo with only gene name information, - it returns the good gff line - """ - line_lst = ("201\t743\tC\tCDS\tESCO.1015.00001.b0001_00001\tge4a\t" - "| NA | NA | NA") - line_gff = ("H561_S27_L001__1\tProdigal:2.6\tCDS\t201\t743\t.\t-\t0\t" - "ID=ONKACNIE_00001;inference=ab initio " - "prediction:Prodigal:2.6;locus_tag=ONKACNIE_00001;product=hypothetical protein") - outgff = "test_handle_line_gff-gene.gff" - gfff = open(outgff, "w") - ffunc.handle_line_gff(line_lst, line_gff, gfff) - gfff.close() - with open(outgff, "r") as gf: - lines = gf.readlines() - assert len(lines) == 1 - exp_line = ("ESCO.1015.00001.0001\tProdigal:2.6\tCDS\t201\t743\t.\t-\t.\t" - "ID=ESCO.1015.00001.b0001_00001;Name=ge4a;gene=ge4a;" - "locus_tag=ESCO.1015.00001.b0001_00001\n") - assert lines[0] == exp_line - os.remove(outgff) - - -def test_handle_line_gff_inf(): - """ - Check that given a line in lstinfo with only inference information, - it returns the good gff line - """ - line_lst = ("201\t743\tC\tCDS\tESCO.1015.00001.b0001_00001\tNA\t" - "| NA | NA | ab initio prediction:Prodigal:2.6") - line_gff = ("H561_S27_L001__1\tProdigal:2.6\tCDS\t201\t743\t.\t-\t0\t" - "ID=ONKACNIE_00001;inference=ab initio " - "prediction:Prodigal:2.6;locus_tag=ONKACNIE_00001;product=hypothetical protein") - outgff = "test_handle_line_gff-inf.gff" - gfff = open(outgff, "w") - ffunc.handle_line_gff(line_lst, line_gff, gfff) - gfff.close() - with open(outgff, "r") as gf: - lines = gf.readlines() - assert len(lines) == 1 - exp_line = ("ESCO.1015.00001.0001\tProdigal:2.6\tCDS\t201\t743\t.\t-\t.\t" - "ID=ESCO.1015.00001.b0001_00001;inference=ab initio " - "prediction:Prodigal:2.6;locus_tag=ESCO.1015.00001.b0001_00001\n") - assert lines[0] == exp_line - os.remove(outgff) - - -def test_handle_line_gff_prod(): - """ - Check that given a line in lstinfo with only product information, - it returns the good gff line - """ - line_lst = ("201\t743\tC\tCDS\tESCO.1015.00001.b0001_00001\tNA\t" - "| hypothetical protein | NA | NA") - line_gff = ("H561_S27_L001__1\tProdigal:2.6\tCDS\t201\t743\t.\t-\t0\t" - "ID=ONKACNIE_00001;inference=ab initio " - "prediction:Prodigal:2.6;locus_tag=ONKACNIE_00001;product=hypothetical protein") - outgff = "test_handle_line_gff-prod.gff" - gfff = open(outgff, "w") - ffunc.handle_line_gff(line_lst, line_gff, gfff) - gfff.close() - with open(outgff, "r") as gf: - lines = gf.readlines() - assert len(lines) == 1 - exp_line = ("ESCO.1015.00001.0001\tProdigal:2.6\tCDS\t201\t743\t.\t-\t.\t" - "ID=ESCO.1015.00001.b0001_00001;locus_tag=ESCO.1015.00001.b0001_00001;" - "product=hypothetical protein\n") - assert lines[0] == exp_line - os.remove(outgff) - - -def test_generate_gff(): - """ - Test creating gff file. - """ - prokgff = os.path.join("test", "data", "annotate", "test_files", "prokka_out_gff.gff") - gffout = os.path.join("test", "data", "annotate", "test_creategff.gff") - lstgenome = os.path.join("test", "data", "annotate", "test_files", "lstinfo_for_gff.lst") - assert ffunc.generate_gff(prokgff, gffout, lstgenome, my_logger()[1]) - exp_gff = os.path.join("test", "data", "annotate", "exp_files", "res_create_gff.gff") - with open(gffout, "r") as gffo, open(exp_gff, "r") as expf: - for line_exp, line_out in zip(expf, gffo): - assert line_exp == line_out - os.remove(gffout) - - -def test_generate_gff_error(): - """ - Test creating gff file. - """ - logger = my_logger() - prokgff = os.path.join("test", "data", "annotate", "test_files", "prokka_out_gff-error.gff") - lstgenome = os.path.join("test", "data", "annotate", "test_files", "lstinfo_for_gff.lst") - gffout = os.path.join("test", "data", "annotate", "test_creategff.gff") - assert not ffunc.generate_gff(prokgff, gffout, lstgenome, logger[1]) - os.remove(gffout) - q = logger[0] - assert q.qsize() == 1 - logfound = q.get() - msg = ("Problem with your gff file. '##FASTA' is not a gff entry line, whereas it should " - "correspond to '863\t1795\tD\tCDS\tESCO.1015.00001.b0003_00016\tNA\t" - "| hypothetical protein | NA | NA'") - assert msg in logfound.message - assert logfound.levelname == "ERROR" - - -def test_handle_genome_nores(): - """ - Test that when we try to format a genome which is not in results, - it returns a tuple with "no_res" and the genome name. - """ - results = {"abcd.fasta": True} - args = ("toto.fasta", "name", "genome/path", "prokka/path", "lst/dir", "prot/dir", - "gene/dir", "rep/dir", "gff/dir", results, my_logger()[0]) - res = ffunc.handle_genome(args) - assert res == ("no_res", "toto.fasta") - - -def test_handle_genome_badprok(): - """ - Test that when we try to format a genome which is in results, but with False, - it returns a tuple with "bad_prokka" and the genome name. - """ - results = {"abcd.fasta": True, "toto.fasta": False} - args = ("toto.fasta", "name", "genome/path", "prokka/path", "lst/dir", "prot/dir", - "gene/dir", "rep/dir", "gff/dir", results, my_logger()[0]) - res = ffunc.handle_genome(args) - assert res == ("bad_prokka", "toto.fasta") - - -def test_handle_genome_formatok(): - """ - Test that when we try to format a genome which is in results, with True, - it returns a tuple with "True" and the genome name. - """ - gpath = os.path.join("test", "data", "annotate", "genomes", - "B2_A3_5.fasta-split5N.fna-short-contig.fna") - name = "test.0417.00002" - prok_path = os.path.join("test", "data", "annotate", "exp_files") - lst_dir = os.path.join("test", "data", "annotate") - prot_dir = lst_dir - gene_dir = lst_dir - rep_dir = lst_dir - gff_dir = lst_dir - results = {"B2_A3_5.fasta-split5N.fna-short-contig.fna": True, "toto.fasta": False} - args = ("B2_A3_5.fasta-split5N.fna-short-contig.fna", name, gpath, prok_path, - lst_dir, prot_dir, - gene_dir, rep_dir, gff_dir, results, my_logger()[0]) - res = ffunc.handle_genome(args) - assert res == (True, "B2_A3_5.fasta-split5N.fna-short-contig.fna") - os.remove(os.path.join(lst_dir, name + ".prt")) - os.remove(os.path.join(lst_dir, name + ".fna")) - os.remove(os.path.join(lst_dir, name + ".gen")) - os.remove(os.path.join(lst_dir, name + ".lst")) - os.remove(os.path.join(lst_dir, name + ".gff")) - - -def test_handle_genome_formaterror(): - """ - Test that when we try to format a genome which is in results, but with False, - it returns a tuple with "bad_prokka" and the genome name. - """ - logger = my_logger() - gpath = os.path.join("test", "data", "annotate", "genomes", - "B2_A3_5.fasta-problems.fna-short-contig.fna") - name = "test.0417.00002" - prok_path = os.path.join("test", "data", "annotate", "exp_files") - tbl_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", - name + ".tbl") - tblout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", - name + ".tbl") - shutil.copyfile(tbl_init, tblout) - gff_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", - name + ".gff") - gffout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", - name + ".gff") - shutil.copyfile(gff_init, gffout) - lst_dir = os.path.join("test", "data", "annotate") - prot_dir = lst_dir - gene_dir = lst_dir - rep_dir = lst_dir - gff_dir = lst_dir - results = {"B2_A3_5.fasta-problems.fna-short-contig.fna": True, "toto.fasta": False} - args = ("B2_A3_5.fasta-problems.fna-short-contig.fna", name, gpath, - prok_path, lst_dir, prot_dir, gene_dir, rep_dir, gff_dir, results, logger[0]) - res = ffunc.handle_genome(args) - assert res == (False, "B2_A3_5.fasta-problems.fna-short-contig.fna") - msg = ("Unknown header format >EPKOMDHM_i00002 hypothetical protein in " - "test/data/annotate/exp_files/B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes/" - "test.0417.00002.ffn.\n" - "Gen file will not be created.") - q = logger[0] - assert q.qsize() == 1 - assert q.get().message == msg - # remove tblout which was copied for this test - assert not os.path.isfile(os.path.join(lst_dir, name + ".prt")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".gen")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".fna")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".lst")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".gff")) - os.remove(tblout) - os.remove(gffout) - - -def test_format1genome(): - """ - Test that formatting a genome (making .prt, .gen, .fna, .lst) works, with a genome - which did not change name between prokka run and format step. - """ - gpath = os.path.join("test", "data", "annotate", "genomes", - "B2_A3_5.fasta-split5N.fna-short-contig.fna") - name = "test.0417.00002" - prok_path = os.path.join("test", "data", "annotate", "exp_files") - lst_dir = os.path.join("test", "data", "annotate") - prot_dir = lst_dir - gene_dir = lst_dir - rep_dir = lst_dir - gff_dir = lst_dir - assert ffunc.format_one_genome(gpath, name, prok_path, lst_dir, prot_dir, - gene_dir, rep_dir, gff_dir, my_logger()[1]) - # Check that all files were created - assert os.path.isfile(os.path.join(lst_dir, name + ".lst")) - assert os.path.isfile(os.path.join(lst_dir, name + ".fna")) - assert os.path.isfile(os.path.join(lst_dir, name + ".prt")) - assert os.path.isfile(os.path.join(lst_dir, name + ".gen")) - assert os.path.isfile(os.path.join(lst_dir, name + ".gff")) - # Check the contents of the files - explst = os.path.join(prok_path, "res_format-B2.lst") - expprt = os.path.join(prok_path, "res_format-B2.prt") - expgen = os.path.join(prok_path, "res_format-B2.gen") - expgff = os.path.join(prok_path, "res_format-B2.gff") - with open(explst, "r") as lstf, open(os.path.join(lst_dir, name + ".lst"), "r") as lsto: - for line_exp, line_out in zip(lstf, lsto): - assert line_exp == line_out - with open(expprt, "r") as expf, open(os.path.join(lst_dir, name + ".prt"), "r") as outf: - for line_exp, line_out in zip(expf, outf): - assert line_exp == line_out - with open(expgen, "r") as expf, open(os.path.join(lst_dir, name + ".gen"), "r") as outf: - for line_exp, line_out in zip(expf, outf): - assert line_exp == line_out - with open(gpath, "r") as expf, open(os.path.join(lst_dir, name + ".fna"), "r") as outf: - for line_exp, line_out in zip(expf, outf): - assert line_exp == line_out - with open(expgff, "r") as expf, open(os.path.join(lst_dir, name + ".gff"), "r") as outf: - for line_exp, line_out in zip(expf, outf): - assert line_exp == line_out - os.remove(os.path.join(lst_dir, name + ".lst")) - os.remove(os.path.join(lst_dir, name + ".prt")) - os.remove(os.path.join(lst_dir, name + ".fna")) - os.remove(os.path.join(lst_dir, name + ".gen")) - os.remove(os.path.join(lst_dir, name + ".gff")) - - -def test_format1genome_change_head(): - """ - Test that formatting a genome (making .prt, .gen, .fna, .lst) works, with a genome - which changed its name between prokka and format step. - """ - ginit = os.path.join("test", "data", "annotate", "genomes", "B2_A3_5.fasta-changeName.fna") - gpath = os.path.join("test", "data", "annotate", "genomes", - "B2_A3_5.fasta-changeName.fna-short-contig.fna") - shutil.copyfile(ginit, gpath) - name = "test.0417.00002" - prok_path = os.path.join("test", "data", "annotate", "exp_files") - lst_dir = os.path.join("test", "data", "annotate") - prot_dir = lst_dir - gene_dir = lst_dir - rep_dir = lst_dir - gff_dir = lst_dir - assert ffunc.format_one_genome(gpath, name, prok_path, lst_dir, - prot_dir, gene_dir, rep_dir, gff_dir, my_logger()[1]) - # Check that all files were created - assert os.path.isfile(os.path.join(lst_dir, name + ".lst")) - assert os.path.isfile(os.path.join(lst_dir, name + ".fna")) - assert os.path.isfile(os.path.join(lst_dir, name + ".prt")) - assert os.path.isfile(os.path.join(lst_dir, name + ".gen")) - assert os.path.isfile(os.path.join(lst_dir, name + ".gff")) - # Check the contents of the files - explst = os.path.join(prok_path, "res_format-B2.lst") - expprt = os.path.join(prok_path, "res_format-B2.prt") - expgen = os.path.join(prok_path, "res_format-B2.gen") - expgff = os.path.join(prok_path, "res_format-B2.gff") - expreplicons = os.path.join("test", "data", "annotate", "genomes", - "B2_A3_5.fasta-split5N.fna-short-contig.fna") - with open(explst, "r") as lstf, open(os.path.join(lst_dir, name + ".lst"), "r") as lsto: - for line_exp, line_out in zip(lstf, lsto): - assert line_exp == line_out - with open(expprt, "r") as expf, open(os.path.join(lst_dir, name + ".prt"), "r") as outf: - for line_exp, line_out in zip(expf, outf): - assert line_exp == line_out - with open(expgen, "r") as expf, open(os.path.join(lst_dir, name + ".gen"), "r") as outf: - for line_exp, line_out in zip(expf, outf): - assert line_exp == line_out - with open(expgff, "r") as expf, open(os.path.join(lst_dir, name + ".gff"), "r") as outf: - for line_exp, line_out in zip(expf, outf): - assert line_exp == line_out - with open(expreplicons, "r") as expf, open(os.path.join(lst_dir, name + ".fna"), "r") as outf: - for line_exp, line_out in zip(expf, outf): - assert line_exp == line_out - os.remove(os.path.join(lst_dir, name + ".lst")) - os.remove(os.path.join(lst_dir, name + ".prt")) - os.remove(os.path.join(lst_dir, name + ".fna")) - os.remove(os.path.join(lst_dir, name + ".gen")) - os.remove(os.path.join(lst_dir, name + ".gff")) - os.remove(gpath) - -def test_format1genome_problemgen(): - """ - Test that formatting a genome (making .prt, .gen, .fna, .lst) returns an error message - and does not create any output file if there is a problem while converting the - .ffn to .gen - """ - logger = my_logger() - gpath = os.path.join("test", "data", "annotate", "genomes", - "B2_A3_5.fasta-problems.fna-short-contig.fna") - name = "test.0417.00002" - prok_path = os.path.join("test", "data", "annotate", "exp_files") - tbl_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", - name + ".tbl") - tblout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", - name + ".tbl") - shutil.copyfile(tbl_init, tblout) - gff_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", - name + ".gff") - gffout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", - name + ".gff") - shutil.copyfile(gff_init, gffout) - lst_dir = os.path.join("test", "data", "annotate") - prot_dir = lst_dir - gene_dir = lst_dir - rep_dir = lst_dir - gff_dir = lst_dir - assert not ffunc.format_one_genome(gpath, name, prok_path, lst_dir, prot_dir, - gene_dir, rep_dir, gff_dir, logger[1]) - # Check that all files were not created - assert not os.path.isfile(os.path.join(lst_dir, name + ".lst")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".fna")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".prt")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".gen")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".gff")) - msg = ("Unknown header format >EPKOMDHM_i00002 hypothetical protein in " - "test/data/annotate/exp_files/B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes/" - "test.0417.00002.ffn.\n" - "Gen file will not be created.") - q = logger[0] - assert q.qsize() == 1 - assert q.get().message == msg - # remove tblout which was copied for this test - os.remove(tblout) - os.remove(gffout) - - -def test_format1genome_problemprt(): - """ - Test that formatting a genome (making .prt, .gen, .fna, .lst) works, with a genome - which did not change name between prokka run and format step. - """ - logger = my_logger() - gpath = os.path.join("test", "data", "annotate", "genomes", - "B2_A3_5.fasta-problems.fna-short-contig.fna") - name = "test.0417.00002" - prok_path = os.path.join("test", "data", "annotate", "exp_files") - # copy tbl without errors to error prokka dir - tbl_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", - name + ".tbl") - tblout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", - name + ".tbl") - shutil.copyfile(tbl_init, tblout) - # copy gff without errors to error prokka dir - gff_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", - name + ".gff") - gffout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", - name + ".gff") - shutil.copyfile(gff_init, gffout) - # copy ffn without error to error prokka dir - ffn_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", - name + ".ffn") - ffn_ok = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", - name + ".ffn") - ffn_error = ffn_ok + "-error" - # change name of ffn file with error to keep it for later (used for tests) - shutil.copyfile(ffn_ok, ffn_error) - # copy ffn without error to prokka res (erasing ffn with error) - shutil.copyfile(ffn_init, ffn_ok) - lst_dir = os.path.join("test", "data", "annotate") - prot_dir = lst_dir - gene_dir = lst_dir - rep_dir = lst_dir - gff_dir = lst_dir - assert not ffunc.format_one_genome(gpath, name, prok_path, lst_dir, prot_dir, - gene_dir, rep_dir, gff_dir, logger[1]) - # Check that all files were not created - assert not os.path.isfile(os.path.join(lst_dir, name + ".lst")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".fna")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".prt")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".gen")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".gff")) - msg = ("Unknown header format >EPKOMDHM00003 hypothetical protein in " - "test/data/annotate/exp_files/B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes/" - "test.0417.00002.faa. " - "Error: invalid literal for int() with base 10: '>EPKOMDHM00003'\n" - "Prt file not created from test/data/annotate/exp_files/" - "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes/test.0417.00002.faa.") - q = logger[0] - assert q.qsize() == 1 - assert q.get().message == msg - # remove files which were copied for this test (tblout). And rename ffn with errors - # to its original name. - os.rename(ffn_error, ffn_ok) - os.remove(tblout) - os.remove(gffout) - - -def test_format1genome_problemgff(): - """ - Test that formatting a genome (making .prt, .gen, .fna, .lst) returns an error message - and does not create any output file if there is a problem while converting the - .ffn to .gen - """ - logger = my_logger() - gpath = os.path.join("test", "data", "annotate", "genomes", - "B2_A3_5.fasta-problems.fna-short-contig.fna") - name = "test.0417.00002" - prok_path = os.path.join("test", "data", "annotate", "exp_files") - tbl_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", - name + ".tbl") - tblout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", - name + ".tbl") - shutil.copyfile(tbl_init, tblout) - gff_init = os.path.join("test", "data", "annotate", "test_files", "prokka_out_gff-error.gff") - gffout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", - name + ".gff") - shutil.copyfile(gff_init, gffout) - lst_dir = os.path.join("test", "data", "annotate") - prot_dir = lst_dir - gene_dir = lst_dir - rep_dir = lst_dir - gff_dir = lst_dir - assert not ffunc.format_one_genome(gpath, name, prok_path, lst_dir, prot_dir, - gene_dir, rep_dir, gff_dir, logger[1]) - # Check that all files were not created - assert not os.path.isfile(os.path.join(lst_dir, name + ".lst")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".fna")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".prt")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".gen")) - assert not os.path.isfile(os.path.join(lst_dir, name + ".gff")) - msg = ("Problem with your gff file. '##FASTA' is not a gff entry line, whereas it should " - "correspond to '11249\t12328\tD\tCDS\ttest.0417.00002.i0002_00016\tNA\t" - "| hypothetical protein | NA | NA'") - q = logger[0] - assert q.qsize() == 1 - assert q.get().message == msg - # remove tblout which was copied for this test - os.remove(tblout) - os.remove(gffout) - - -def test_format_all(): - """ - Test that when giving a list of genomes, for which prokka ran without problem, - they are formatted, with all expected files created. - """ - # genomes = {genome: [name, gpath, size, nbcont, l90]} - initnames = ["H299_H561.fasta", "B2_A3_5.fasta-changeName.fna"] - initpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in initnames] - gnames = ["H299_H561.fasta-short-contig.fna", "B2_A3_5.fasta-changeName.fna-short-contig.fna"] - onames = ["test_runprokka_H299", "test.0417.00002"] - gpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in gnames] - for f1, f2 in zip(initpaths, gpaths): - shutil.copyfile(f1, f2) - genomes = {gnames[0]: [onames[0], gpaths[0], 12656, 3, 1], - gnames[1]: [onames[1], gpaths[1], 456464645, 5, 1] - } - prok_path = os.path.join("test", "data", "annotate", "exp_files") - res_path = os.path.join("test", "data", "annotate") - results = {gname: True for gname in gnames} - skipped, skipped_format = ffunc.format_genomes(genomes, results, res_path, - prok_path, threads=4) - assert skipped == [] - assert skipped_format == [] - lstfiles = [os.path.join(res_path, "LSTINFO", name + ".lst") for name in onames] - prtfiles = [os.path.join(res_path, "Proteins", name + ".prt") for name in onames] - genfiles = [os.path.join(res_path, "Genes", name + ".gen") for name in onames] - repfiles = [os.path.join(res_path, "Replicons", name + ".fna") for name in onames] - gfffiles = [os.path.join(res_path, "gff3", name + ".gff") for name in onames] - for f in lstfiles + prtfiles + genfiles + repfiles + gfffiles: - assert os.path.isfile(f) - shutil.rmtree(os.path.join(res_path, "LSTINFO")) - shutil.rmtree(os.path.join(res_path, "Proteins")) - shutil.rmtree(os.path.join(res_path, "Genes")) - shutil.rmtree(os.path.join(res_path, "Replicons")) - shutil.rmtree(os.path.join(res_path, "gff3")) - for f in gpaths: - os.remove(f) - - -def test_format_all_result_false(): - """ - Test that when giving a list of 2 genomes, 1 for which prokka ran without problem, - 1 for which prokka had problems (given with False in results), - the correct genome is formatted, with all - expected files created, and the genome with problems is not formatted. - """ - # genomes = {genome: [name, gpath, size, nbcont, l90]} - initnames = ["H299_H561.fasta", "B2_A3_5.fasta-changeName.fna"] - initpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in initnames] - gnames = ["H299_H561.fasta-short-contig.fna", "B2_A3_5.fasta-changeName.fna-short-contig.fna"] - onames = ["test_runprokka_H299", "test.0417.00002"] - gpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in gnames] - for f1, f2 in zip(initpaths, gpaths): - shutil.copyfile(f1, f2) - genomes = {gnames[0]: [onames[0], gpaths[0], 12656, 3, 1], - gnames[1]: [onames[1], gpaths[1], 456464645, 5, 1] - } - prok_path = os.path.join("test", "data", "annotate", "exp_files") - res_path = os.path.join("test", "data", "annotate") - results = {gnames[0]: True, gnames[1]: False} - skipped, skipped_format = ffunc.format_genomes(genomes, results, res_path, prok_path) - assert skipped == ["B2_A3_5.fasta-changeName.fna-short-contig.fna"] - assert skipped_format == [] - lstfiles = os.path.join(res_path, "LSTINFO") - prtfiles = os.path.join(res_path, "Proteins") - genfiles = os.path.join(res_path, "Genes") - repfiles = os.path.join(res_path, "Replicons") - gfffiles = os.path.join(res_path, "gff3") - assert os.path.isfile(os.path.join(lstfiles, onames[0] + ".lst")) - assert not os.path.isfile(os.path.join(lstfiles, onames[1] + ".lst")) - assert os.path.isfile(os.path.join(prtfiles, onames[0] + ".prt")) - assert not os.path.isfile(os.path.join(prtfiles, onames[1] + ".prt")) - assert os.path.isfile(os.path.join(genfiles, onames[0] + ".gen")) - assert not os.path.isfile(os.path.join(genfiles, onames[1] + ".gen")) - assert os.path.isfile(os.path.join(repfiles, onames[0] + ".fna")) - assert not os.path.isfile(os.path.join(repfiles, onames[1] + ".fna")) - assert os.path.isfile(os.path.join(gfffiles, onames[0] + ".gff")) - assert not os.path.isfile(os.path.join(gfffiles, onames[1] + ".gff")) - shutil.rmtree(os.path.join(res_path, "LSTINFO")) - shutil.rmtree(os.path.join(res_path, "Proteins")) - shutil.rmtree(os.path.join(res_path, "Genes")) - shutil.rmtree(os.path.join(res_path, "Replicons")) - shutil.rmtree(os.path.join(res_path, "gff3")) - for f in gpaths: - os.remove(f) - - -def test_format_all_not_result(): - """ - Test that when giving a list of 2 genomes, but only 1 is in the results list (and prokka ran - without problems for it), the correct genome is formatted, with all - expected files created, and the other genome is not formatted, and does not appear in - skipped list (as it was removed from the study before annotation step, probably by QC). - """ - # genomes = {genome: [name, gpath, size, nbcont, l90]} - initnames = ["H299_H561.fasta", "B2_A3_5.fasta-changeName.fna"] - initpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in initnames] - gnames = ["H299_H561.fasta-short-contig.fna", "B2_A3_5.fasta-changeName.fna-short-contig.fna"] - onames = ["test_runprokka_H299", "test.0417.00002"] - gpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in gnames] - for f1, f2 in zip(initpaths, gpaths): - shutil.copyfile(f1, f2) - genomes = {gnames[0]: [onames[0], gpaths[0], 12656, 3, 1], - gnames[1]: [onames[1], gpaths[1], 456464645, 5, 1] - } - prok_path = os.path.join("test", "data", "annotate", "exp_files") - res_path = os.path.join("test", "data", "annotate") - results = {gnames[0]: True} - skipped, skipped_format = ffunc.format_genomes(genomes, results, res_path, prok_path) - assert skipped == [] - assert skipped_format == [] - lstfiles = os.path.join(res_path, "LSTINFO") - prtfiles = os.path.join(res_path, "Proteins") - genfiles = os.path.join(res_path, "Genes") - repfiles = os.path.join(res_path, "Replicons") - gfffiles = os.path.join(res_path, "gff3") - assert os.path.isfile(os.path.join(lstfiles, onames[0] + ".lst")) - assert not os.path.isfile(os.path.join(lstfiles, onames[1] + ".lst")) - assert os.path.isfile(os.path.join(prtfiles, onames[0] + ".prt")) - assert not os.path.isfile(os.path.join(prtfiles, onames[1] + ".prt")) - assert os.path.isfile(os.path.join(genfiles, onames[0] + ".gen")) - assert not os.path.isfile(os.path.join(genfiles, onames[1] + ".gen")) - assert os.path.isfile(os.path.join(repfiles, onames[0] + ".fna")) - assert not os.path.isfile(os.path.join(repfiles, onames[1] + ".fna")) - assert os.path.isfile(os.path.join(gfffiles, onames[0] + ".gff")) - assert not os.path.isfile(os.path.join(gfffiles, onames[1] + ".gff")) - shutil.rmtree(os.path.join(res_path, "LSTINFO")) - shutil.rmtree(os.path.join(res_path, "Proteins")) - shutil.rmtree(os.path.join(res_path, "Genes")) - shutil.rmtree(os.path.join(res_path, "Replicons")) - shutil.rmtree(os.path.join(res_path, "gff3")) - for f in gpaths: - os.remove(f) - - # probleme avec .fna de onames[0] qui n'est pas créé... - - -def test_format_all_error(): - """ - Test that when giving a list of 2 genomes, prokka ran without problem for both. - But a problem appears while formatting the 2nd one. So, the 2nd one is not formatted, - and appears in skipped_format. The first one is formated, and check that all - output files are created. - """ - # genomes = {genome: [name, gpath, size, nbcont, l90]} - name = "test.0417.00002" - initnames = ["H299_H561.fasta", "B2_A3_5.fasta-changeName.fna"] - initpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in initnames] - gnames = ["H299_H561.fasta-short-contig.fna", "B2_A3_5.fasta-problems.fna-short-contig.fna"] - onames = ["test_runprokka_H299", "test.0417.00002"] - gpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in gnames] - for f1, f2 in zip(initpaths, gpaths): - shutil.copyfile(f1, f2) - genomes = {gnames[0]: [onames[0], gpaths[0], 12656, 3, 1], - gnames[1]: [onames[1], gpaths[1], 456464645, 5, 1] - } - prok_path = os.path.join("test", "data", "annotate", "exp_files") - res_path = os.path.join("test", "data", "annotate") - tbl_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", - name + ".tbl") - tblout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", - name + ".tbl") - shutil.copyfile(tbl_init, tblout) - gff_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", - name + ".gff") - gffout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", - name + ".gff") - shutil.copyfile(gff_init, gffout) - results = {gnames[0]: True, gnames[1]: True} - skipped, skipped_format = ffunc.format_genomes(genomes, results, res_path, prok_path) - assert skipped == [] - assert skipped_format == ["B2_A3_5.fasta-problems.fna-short-contig.fna"] - lstfiles = os.path.join(res_path, "LSTINFO") - prtfiles = os.path.join(res_path, "Proteins") - genfiles = os.path.join(res_path, "Genes") - repfiles = os.path.join(res_path, "Replicons") - gfffiles = os.path.join(res_path, "gff3") - assert os.path.isfile(os.path.join(lstfiles, onames[0] + ".lst")) - assert not os.path.isfile(os.path.join(lstfiles, onames[1] + ".lst")) - assert os.path.isfile(os.path.join(prtfiles, onames[0] + ".prt")) - assert not os.path.isfile(os.path.join(prtfiles, onames[1] + ".prt")) - assert os.path.isfile(os.path.join(genfiles, onames[0] + ".gen")) - assert not os.path.isfile(os.path.join(genfiles, onames[1] + ".gen")) - assert os.path.isfile(os.path.join(repfiles, onames[0] + ".fna")) - assert not os.path.isfile(os.path.join(repfiles, onames[1] + ".fna")) - assert os.path.isfile(os.path.join(gfffiles, onames[0] + ".gff")) - assert not os.path.isfile(os.path.join(gfffiles, onames[1] + ".gff")) - shutil.rmtree(os.path.join(res_path, "LSTINFO")) - shutil.rmtree(os.path.join(res_path, "Proteins")) - shutil.rmtree(os.path.join(res_path, "Genes")) - shutil.rmtree(os.path.join(res_path, "Replicons")) - shutil.rmtree(os.path.join(res_path, "gff3")) - os.remove(tblout) - os.remove(gffout) - for f in gpaths: - os.remove(f) +# def test_tbl_to_lst_new_name(): +# """ +# Check that generated lstinfo file is as expected, when the genome name has changed between +# the one given to prokka, and the name given now. +# The test tblfile contains the following aspects: +# - gene in D strand (start < end) +# - gene in C strand (start > end) +# - CDS features (some with all info = ECnumber, gene name, product etc. ; +# some with missing info) +# - tRNA type +# - repeat_region type (*2) +# - contigs with more than 2 genes +# - contig with only 2 genes (both 'b' loc) +# - contig with 1 gene ('b' loc) +# - contig without gene (should be skipped) +# """ +# tblfile = os.path.join("test", "data", "annotate", "test_files", "original_name.fna-prokkaRes", +# "prokka_out_for_test.tbl") +# lstfile = os.path.join("test", "data", "annotate", "test_tbl2lstNewName.lst") +# exp_lst = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst-newName.lst") +# name = "test.0417.00010" +# assert not ffunc.tbl2lst(tblfile, lstfile, name) +# with open(exp_lst, "r") as expf, open(lstfile, "r") as lstf: +# for line_exp, line_out in zip(expf, lstf): +# assert line_exp == line_out +# os.remove(lstfile) + + +# def test_write_header_gene(): +# """ +# From a given line of lstinfo file, giving info for a gene (start, end, gene name, +# product, EC number, more information), check that the header line of the protein and +# gene files are generated as expected. +# """ +# outfile = StringIO() +# lstline = ("4416\t6068\tD\tCDS\ttest.0417.00002.i0001_00005\tyiaD\t| " +# "putative lipoprotein YiaD | 6.3.2.- | similar to AA sequence:UniProtKB:P37665") +# ffunc.write_header(lstline, outfile) +# res = outfile.getvalue() +# exp = (">test.0417.00002.i0001_00005 1653 yiaD | putative lipoprotein YiaD | 6.3.2.- " +# "| similar to AA sequence:UniProtKB:P37665\n") +# assert res == exp +# outfile.close() + + +# def test_write_header_gene_no_name(): +# """ +# From a given line of lstinfo file, giving info for a gene with many unknown parts (gene +# name, product, EC number and more information are NAs), check that the header line of the +# protein and gene files are generated as expected. +# """ +# outfile = StringIO() +# lstline = ("4632\t5000\tC\tCDS\ttest.0417.00002.b0002_00011\tNA\t| hypothetical protein " +# "| NA | NA") +# ffunc.write_header(lstline, outfile) +# res = outfile.getvalue() +# exp = ">test.0417.00002.b0002_00011 369 NA | hypothetical protein | NA | NA\n" +# assert res == exp +# outfile.close() + + +# def test_write_header_crispr(): +# """ +# From a given line of lstinfo file, giving info for a CRISPR check that the header +# line of the protein and gene files are generated as expected. +# """ +# outfile = StringIO() +# lstline = ("296902\t2968265\tC\tCRISPR\ttest.0417.00002.b0003_CRISPR1\tcrispr\t| " +# "crispr-array | NA | NA") +# ffunc.write_header(lstline, outfile) +# res = outfile.getvalue() +# exp = ">test.0417.00002.b0003_CRISPR1 2671364 crispr | crispr-array | NA | NA\n" +# assert res == exp +# outfile.close() + + +# def test_create_prt_wrong_header_sep(): +# """ +# Test that, when creating prt file from faa and lst, if a header of faa file is +# not in the right format (protein name and number are not separated by '_'), +# it writes an error, erases prt file, and returns False. +# """ +# logger = my_logger() +# faaseq = os.path.join("test", "data", "annotate", "test_files", +# "prokka_out_for_test-wrongHeaderSep.faa") +# lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") +# prtseq = os.path.join("test", "data", "annotate", "test_create_prt-wrongHeadSep.prt") +# assert not ffunc.create_prt(faaseq, lstfile, prtseq, logger[1]) +# assert not os.path.isfile(prtseq) +# msg = ("Unknown header format >JGIKIPIJ00008 in test/data/annotate/test_files/" +# "prokka_out_for_test-wrongHeaderSep.faa. Error: invalid literal for int() " +# "with base 10: '>JGIKIPIJ00008'\nPrt file not created from " +# "test/data/annotate/test_files/prokka_out_for_test-wrongHeaderSep.faa.") +# q = logger[0] +# assert q.qsize() == 1 +# assert q.get().message == msg + + +# def test_create_prt_wrong_header_int(): +# """ +# Test that, when creating prt file from faa and lst, if a header of faa file is +# not in the right format (protein name and number are separated by '_', but protein num +# contains a letter), it writes an error, erases prt file, and returns False. +# """ +# logger = my_logger() +# faaseq = os.path.join("test", "data", "annotate", "test_files", +# "prokka_out_for_test-wrongHeaderInt.faa") +# lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") +# prtseq = os.path.join("test", "data", "annotate", "test_create_prt-wrongHeadInt.prt") +# assert not ffunc.create_prt(faaseq, lstfile, prtseq, logger[1]) +# assert not os.path.isfile(prtseq) +# msg = ("Unknown header format >JGIKIPIJ_d0008 in test/data/annotate/test_files/" +# "prokka_out_for_test-wrongHeaderInt.faa. Error: invalid literal for int() " +# "with base 10: 'd0008'\nPrt file not created from " +# "test/data/annotate/test_files/prokka_out_for_test-wrongHeaderInt.faa.") +# q = logger[0] +# assert q.qsize() == 1 +# assert q.get().message == msg + + +# def test_create_prt_miss_lst(): +# """ +# Test that, when creating prt file from faa and lst, if a protein of faa file is not present in +# the lst file, it writes an error, removes the prt file, and returns False. +# """ +# logger = my_logger() +# faaseq = os.path.join("test", "data", "annotate", "test_files", +# "prokka_out_for_test-supHeader.faa") +# lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") +# prtseq = os.path.join("test", "data", "annotate", "test_create_prt-missLst.prt") +# assert not ffunc.create_prt(faaseq, lstfile, prtseq, logger[1]) +# assert not os.path.isfile(prtseq) +# msg = ("Missing info for protein >sup-prot_00012 in " +# "test/data/annotate/exp_files/res_tbl2lst.lst. If it is " +# "actually present in the lst file, check that proteins are ordered by " +# "increasing number in both lst and faa files.\n" +# "Prt file not created from test/data/annotate/test_files/" +# "prokka_out_for_test-supHeader.faa.") +# q = logger[0] +# assert q.qsize() == 1 +# assert q.get().message == msg + + +# def test_create_prt_wrong_order(): +# """ +# Test that, when creating prt file from faa and lst, if a protein of faa file is not in +# increasing protein number, so that it does not correspond to the protein in the lstinfo file, +# it writes an error, removes the prt file, and returns False. +# """ +# logger = my_logger() +# faaseq = os.path.join("test", "data", "annotate", "test_files", +# "prokka_out_for_test-wrongOrder.faa") +# lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") +# prtseq = os.path.join("test", "data", "annotate", "test_create_prt-wrongOrder.prt") +# assert not ffunc.create_prt(faaseq, lstfile, prtseq, logger[1]) +# assert not os.path.isfile(prtseq) +# msg = ("Missing info for protein >appears_after_13_00011 in " +# "test/data/annotate/exp_files/res_tbl2lst.lst. If it is " +# "actually present in the lst file, check that proteins are ordered by " +# "increasing number in both lst and faa files.\n" +# "Prt file not created from test/data/annotate/test_files/" +# "prokka_out_for_test-wrongOrder.faa.") +# q = logger[0] +# assert q.qsize() == 1 +# assert q.get().message == msg + + +# def test_create_prt_ok(): +# """ +# Test that when everything is ok in both faa and lst files, the prt file is +# created as expected. +# """ +# logger = my_logger() +# faaseq = os.path.join("test", "data", "annotate", "test_files", "original_name.fna-prokkaRes", +# "prokka_out_for_test.faa") +# lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") +# prtseq = os.path.join("test", "data", "annotate", "test_create_prt.prt") +# assert ffunc.create_prt(faaseq, lstfile, prtseq, logger[1]) +# exp_file = os.path.join("test", "data", "annotate", "exp_files", "res_create_prt.faa") +# with open(exp_file, "r") as expf, open(prtseq, "r") as prtf: +# for line_exp, line_out in zip(expf, prtf): +# assert line_exp == line_out +# os.remove(prtseq) + + +# def test_create_gen_sup_crispr(): +# """ +# Test that when there is a CRISPR in the ffn file, but not in lstinfo, +# it generates an error, because the CRISPR ID does not correspond to the gene ID in lstinfo. +# It should return False, write an error message, and remove the .gen file. +# Moreover, the CRISPR ID is not in the same format as a gene ID, so the error should +# be on the format. +# """ +# logger = my_logger() +# ffnseq = os.path.join("test", "data", "annotate", "test_files", +# "prokka_out_for_test-supCRISPR.ffn") +# lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") +# genseq = os.path.join("test", "data", "annotate", "test_create_gen_supCRISPR.gen") +# assert not ffunc.create_gen(ffnseq, lstfile, genseq, logger[1]) +# assert not os.path.isfile(genseq) +# msg = ("Unknown header format >prokka_out_for_test in test/data/annotate/test_files/" +# "prokka_out_for_test-supCRISPR.ffn.\nGen file will not be created.") +# q = logger[0] +# assert q.qsize() == 1 +# assert q.get().message == msg + + +# def test_create_gen_sup_gene(): +# """ +# Test that, when creating gen file from ffn and lst, if a gene of ffn file is not present in +# the lst file, it writes an error, removes the gen file, and returns False. +# """ +# logger = my_logger() +# ffnseq = os.path.join("test", "data", "annotate", "test_files", +# "prokka_out_for_test-supGene.ffn") +# lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") +# prtseq = os.path.join("test", "data", "annotate", "test_create_gen-supgene.prt") +# assert not ffunc.create_gen(ffnseq, lstfile, prtseq, logger[1]) +# assert not os.path.isfile(prtseq) +# msg = ("Missing info for gene >sup_gene_00012 in test/data/annotate/exp_files/" +# "res_tbl2lst.lst. If it is actually present " +# "in the lst file, check that genes are ordered by increasing " +# "number in both lst and ffn files.\nGen file not created" +# " from test/data/annotate/test_files/prokka_out_for_test-supGene.ffn.") +# q = logger[0] +# assert q.qsize() == 1 +# assert q.get().message == msg + + +# def test_create_gen_miss_crispr(): +# """ +# Test for situation where there are 2 CRISPRs in the lstinfo file, the first one +# is not in the ffn file, while the second one is in the ffn file. +# It should return an error message about CRISPR number not corresponding, as +# the first CRISPR found in ffn (CRISPR 1) corresponds to CRISPR2 in lstinfo file. +# Gene file should be removed, and the function should return False +# """ +# logger = my_logger() +# ffnseq = os.path.join("test", "data", "annotate", "test_files", +# "prokka_out_for_test-missCRISPR.ffn") +# lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") +# genseq = os.path.join("test", "data", "annotate", "test_create_gen_missCRISPR.gen") +# assert not ffunc.create_gen(ffnseq, lstfile, genseq, logger[1]) +# assert not os.path.isfile(genseq) +# msg = ("Problem with CRISPR numbers in test/data/annotate/exp_files/res_tbl2lst.lst. " +# "CRISPR >prokka_out_for_test in ffn is CRISPR num 1, whereas it is annotated " +# "as CRISPR num 2 in lst file.") +# q = logger[0] +# assert q.qsize() == 1 +# assert q.get().message == msg + + +# def test_create_gen_no_crispr_ffn(): +# """ +# Test that when the there is a CRISPR in the lst file, but not in the ffn file, +# everything goes well, as in some versions of prokka (1.12)n CRISPRs are not in ffn while +# they are specified in lst. Function should return True, and gene file created. +# """ +# ffnseq = os.path.join("test", "data", "annotate", "test_files", +# "prokka_out_for_test-noCRISPRffn.ffn") +# lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") +# genseq = os.path.join("test", "data", "annotate", "test_create_gen_noCRISPRffn.gen") +# assert ffunc.create_gen(ffnseq, lstfile, genseq, my_logger()[1]) +# assert os.path.isfile(genseq) +# exp_file = os.path.join("test", "data", "annotate", "exp_files", +# "res_create_gen_noCRISPRffn.gen") +# with open(exp_file, "r") as expf, open(genseq, "r") as prtf: +# for line_exp, line_out in zip(expf, prtf): +# assert line_exp == line_out +# os.remove(genseq) + + +# def test_create_gen_wrong_header_sep(): +# """ +# Test that, when creating gen file from ffn and lst, if a header of ffn file is +# not in the right format (gene name and number are not separated by '_'), +# it writes an error, erases gen file, and returns False. +# """ +# logger = my_logger() +# ffnseq = os.path.join("test", "data", "annotate", "test_files", +# "prokka_out_for_test-wrongFormat.ffn") +# lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") +# genseq = os.path.join("test", "data", "annotate", "test_create_gen_wrongHeadSep.gen") +# assert not ffunc.create_gen(ffnseq, lstfile, genseq, logger[1]) +# assert not os.path.isfile(genseq) +# msg = ("Unknown header format >JGIKIPIJ-00005 in test/data/annotate/test_files/" +# "prokka_out_for_test-wrongFormat.ffn.\n" +# "Gen file will not be created.") +# q = logger[0] +# assert q.qsize() == 1 +# assert q.get().message == msg + + +# def test_create_gen_wrong_header_int(): +# """ +# Test that, when creating gen file from ffn and lst, if a header of ffn file is +# not in the right format (gene name and number are separated by '_', but gene num +# contains a letter), it writes an error, erases gen file, and returns False. +# """ +# logger = my_logger() +# ffnseq = os.path.join("test", "data", "annotate", "test_files", +# "prokka_out_for_test-wrongInt.ffn") +# lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") +# genseq = os.path.join("test", "data", "annotate", "test_create_gen_wrongHeadInt.gen") +# assert not ffunc.create_gen(ffnseq, lstfile, genseq, logger[1]) +# assert not os.path.isfile(genseq) +# msg = ("Unknown header format >JGIKIPIJ_a00005 in test/data/annotate/test_files/" +# "prokka_out_for_test-wrongInt.ffn.\n" +# "Gen file will not be created.") +# q = logger[0] +# assert q.qsize() == 1 +# assert q.get().message == msg + + +# def test_create_gen_wrong_lst_int(): +# """ +# Test that, when creating gen file from ffn and lst, if a gene name in lst file is +# not in the right format (gene name and number are separated by '_', but gene num +# contains a letter), it writes an error, erases gen file, and returns False. +# Because the gene name in ffn won't be found in lst (a it contains an error in lst). +# """ +# logger = my_logger() +# ffnseq = os.path.join("test", "data", "annotate", "test_files", "original_name.fna-prokkaRes", +# "prokka_out_for_test.ffn") +# lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst-wrongGeneName.lst") +# genseq = os.path.join("test", "data", "annotate", "test_create_gen_wrongLstHeadInt.gen") +# assert not ffunc.create_gen(ffnseq, lstfile, genseq, logger[1]) +# assert not os.path.isfile(genseq) +# msg = ("Missing info for gene >JGIKIPIJ_00009 in test/data/annotate/exp_files/" +# "res_tbl2lst-wrongGeneName.lst. If it is actually present " +# "in the lst file, check that genes are ordered by increasing " +# "number in both lst and ffn files.\nGen file not created " +# "from test/data/annotate/test_files/original_name.fna-prokkaRes/" +# "prokka_out_for_test.ffn.") +# q = logger[0] +# assert q.qsize() == 1 +# assert q.get().message == msg + + +# def test_create_gen_ok(): +# """ +# Test that when everything is ok in both ffn and lst files, the gen file is +# created as expected. +# """ +# faaseq = os.path.join("test", "data", "annotate", "test_files", "original_name.fna-prokkaRes", +# "prokka_out_for_test.ffn") +# lstfile = os.path.join("test", "data", "annotate", "exp_files", "res_tbl2lst.lst") +# genseq = os.path.join("test", "data", "annotate", "test_create_gen.gen") +# assert ffunc.create_gen(faaseq, lstfile, genseq, my_logger()[1]) +# exp_file = os.path.join("test", "data", "annotate", "exp_files", "res_create_gen.gen") +# with open(exp_file, "r") as expf, open(genseq, "r") as prtf: +# for line_exp, line_out in zip(expf, prtf): +# assert line_exp == line_out +# os.remove(genseq) + + +# def test_handle_line_gff(): +# """ +# Check that given a line in lstinfo with no information (gene name, ecnumber, product, +# inference), it returns the good gff line +# """ +# line_lst = ("201\t743\tC\tCDS\tESCO.1015.00001.b0001_00001\tNA\t" +# "| NA | NA | NA") +# line_gff = ("H561_S27_L001__1\tProdigal:2.6\tCDS\t201\t743\t.\t-\t0\t" +# "ID=ONKACNIE_00001;inference=ab initio " +# "prediction:Prodigal:2.6;locus_tag=ONKACNIE_00001;product=hypothetical protein") +# outgff = "test_handle_line_gff.gff" +# gfff = open(outgff, "w") +# ffunc.handle_line_gff(line_lst, line_gff, gfff) +# gfff.close() +# with open(outgff, "r") as gf: +# lines = gf.readlines() +# assert len(lines) == 1 +# exp_line = ("ESCO.1015.00001.0001\tProdigal:2.6\tCDS\t201\t743\t.\t-\t.\t" +# "ID=ESCO.1015.00001.b0001_00001;locus_tag=ESCO.1015.00001.b0001_00001\n") +# assert lines[0] == exp_line +# os.remove(outgff) + + +# def test_handle_line_gff_ecnum(): +# """ +# Check that given a line in lstinfo with only ec_number information, +# it returns the good gff line +# """ +# line_lst = ("201\t743\tC\tCDS\tESCO.1015.00001.b0001_00001\tNA\t" +# "| NA | 123.995.4546 | NA") +# line_gff = ("H561_S27_L001__1\tProdigal:2.6\tCDS\t201\t743\t.\t-\t0\t" +# "ID=ONKACNIE_00001;inference=ab initio " +# "prediction:Prodigal:2.6;locus_tag=ONKACNIE_00001;product=hypothetical protein") +# outgff = "test_handle_line_gff-ecnum.gff" +# gfff = open(outgff, "w") +# ffunc.handle_line_gff(line_lst, line_gff, gfff) +# gfff.close() +# with open(outgff, "r") as gf: +# lines = gf.readlines() +# assert len(lines) == 1 +# exp_line = ("ESCO.1015.00001.0001\tProdigal:2.6\tCDS\t201\t743\t.\t-\t.\t" +# "ID=ESCO.1015.00001.b0001_00001;eC_number=123.995.4546;" +# "locus_tag=ESCO.1015.00001.b0001_00001\n") +# assert lines[0] == exp_line +# os.remove(outgff) + + +# def test_handle_line_gff_gene(): +# """ +# Check that given a line in lstinfo with only gene name information, +# it returns the good gff line +# """ +# line_lst = ("201\t743\tC\tCDS\tESCO.1015.00001.b0001_00001\tge4a\t" +# "| NA | NA | NA") +# line_gff = ("H561_S27_L001__1\tProdigal:2.6\tCDS\t201\t743\t.\t-\t0\t" +# "ID=ONKACNIE_00001;inference=ab initio " +# "prediction:Prodigal:2.6;locus_tag=ONKACNIE_00001;product=hypothetical protein") +# outgff = "test_handle_line_gff-gene.gff" +# gfff = open(outgff, "w") +# ffunc.handle_line_gff(line_lst, line_gff, gfff) +# gfff.close() +# with open(outgff, "r") as gf: +# lines = gf.readlines() +# assert len(lines) == 1 +# exp_line = ("ESCO.1015.00001.0001\tProdigal:2.6\tCDS\t201\t743\t.\t-\t.\t" +# "ID=ESCO.1015.00001.b0001_00001;Name=ge4a;gene=ge4a;" +# "locus_tag=ESCO.1015.00001.b0001_00001\n") +# assert lines[0] == exp_line +# os.remove(outgff) + + +# def test_handle_line_gff_inf(): +# """ +# Check that given a line in lstinfo with only inference information, +# it returns the good gff line +# """ +# line_lst = ("201\t743\tC\tCDS\tESCO.1015.00001.b0001_00001\tNA\t" +# "| NA | NA | ab initio prediction:Prodigal:2.6") +# line_gff = ("H561_S27_L001__1\tProdigal:2.6\tCDS\t201\t743\t.\t-\t0\t" +# "ID=ONKACNIE_00001;inference=ab initio " +# "prediction:Prodigal:2.6;locus_tag=ONKACNIE_00001;product=hypothetical protein") +# outgff = "test_handle_line_gff-inf.gff" +# gfff = open(outgff, "w") +# ffunc.handle_line_gff(line_lst, line_gff, gfff) +# gfff.close() +# with open(outgff, "r") as gf: +# lines = gf.readlines() +# assert len(lines) == 1 +# exp_line = ("ESCO.1015.00001.0001\tProdigal:2.6\tCDS\t201\t743\t.\t-\t.\t" +# "ID=ESCO.1015.00001.b0001_00001;inference=ab initio " +# "prediction:Prodigal:2.6;locus_tag=ESCO.1015.00001.b0001_00001\n") +# assert lines[0] == exp_line +# os.remove(outgff) + + +# def test_handle_line_gff_prod(): +# """ +# Check that given a line in lstinfo with only product information, +# it returns the good gff line +# """ +# line_lst = ("201\t743\tC\tCDS\tESCO.1015.00001.b0001_00001\tNA\t" +# "| hypothetical protein | NA | NA") +# line_gff = ("H561_S27_L001__1\tProdigal:2.6\tCDS\t201\t743\t.\t-\t0\t" +# "ID=ONKACNIE_00001;inference=ab initio " +# "prediction:Prodigal:2.6;locus_tag=ONKACNIE_00001;product=hypothetical protein") +# outgff = "test_handle_line_gff-prod.gff" +# gfff = open(outgff, "w") +# ffunc.handle_line_gff(line_lst, line_gff, gfff) +# gfff.close() +# with open(outgff, "r") as gf: +# lines = gf.readlines() +# assert len(lines) == 1 +# exp_line = ("ESCO.1015.00001.0001\tProdigal:2.6\tCDS\t201\t743\t.\t-\t.\t" +# "ID=ESCO.1015.00001.b0001_00001;locus_tag=ESCO.1015.00001.b0001_00001;" +# "product=hypothetical protein\n") +# assert lines[0] == exp_line +# os.remove(outgff) + + +# def test_generate_gff(): +# """ +# Test creating gff file. +# """ +# prokgff = os.path.join("test", "data", "annotate", "test_files", "prokka_out_gff.gff") +# gffout = os.path.join("test", "data", "annotate", "test_creategff.gff") +# lstgenome = os.path.join("test", "data", "annotate", "test_files", "lstinfo_for_gff.lst") +# assert ffunc.generate_gff(prokgff, gffout, lstgenome, my_logger()[1]) +# exp_gff = os.path.join("test", "data", "annotate", "exp_files", "res_create_gff.gff") +# with open(gffout, "r") as gffo, open(exp_gff, "r") as expf: +# for line_exp, line_out in zip(expf, gffo): +# assert line_exp == line_out +# os.remove(gffout) + + +# def test_generate_gff_error(): +# """ +# Test creating gff file. +# """ +# logger = my_logger() +# prokgff = os.path.join("test", "data", "annotate", "test_files", "prokka_out_gff-error.gff") +# lstgenome = os.path.join("test", "data", "annotate", "test_files", "lstinfo_for_gff.lst") +# gffout = os.path.join("test", "data", "annotate", "test_creategff.gff") +# assert not ffunc.generate_gff(prokgff, gffout, lstgenome, logger[1]) +# os.remove(gffout) +# q = logger[0] +# assert q.qsize() == 1 +# logfound = q.get() +# msg = ("Problem with your gff file. '##FASTA' is not a gff entry line, whereas it should " +# "correspond to '863\t1795\tD\tCDS\tESCO.1015.00001.b0003_00016\tNA\t" +# "| hypothetical protein | NA | NA'") +# assert msg in logfound.message +# assert logfound.levelname == "ERROR" + + +# def test_handle_genome_nores(): +# """ +# Test that when we try to format a genome which is not in results, +# it returns a tuple with "no_res" and the genome name. +# """ +# results = {"abcd.fasta": True} +# args = ("toto.fasta", "name", "genome/path", "prokka/path", "lst/dir", "prot/dir", +# "gene/dir", "rep/dir", "gff/dir", results, my_logger()[0]) +# res = ffunc.handle_genome(args) +# assert res == ("no_res", "toto.fasta") + + +# def test_handle_genome_badprok(): +# """ +# Test that when we try to format a genome which is in results, but with False, +# it returns a tuple with "bad_prokka" and the genome name. +# """ +# results = {"abcd.fasta": True, "toto.fasta": False} +# args = ("toto.fasta", "name", "genome/path", "prokka/path", "lst/dir", "prot/dir", +# "gene/dir", "rep/dir", "gff/dir", results, my_logger()[0]) +# res = ffunc.handle_genome(args) +# assert res == ("bad_prokka", "toto.fasta") + + +# def test_handle_genome_formatok(): +# """ +# Test that when we try to format a genome which is in results, with True, +# it returns a tuple with "True" and the genome name. +# """ +# gpath = os.path.join("test", "data", "annotate", "genomes", +# "B2_A3_5.fasta-split5N.fna-short-contig.fna") +# name = "test.0417.00002" +# prok_path = os.path.join("test", "data", "annotate", "exp_files") +# lst_dir = os.path.join("test", "data", "annotate") +# prot_dir = lst_dir +# gene_dir = lst_dir +# rep_dir = lst_dir +# gff_dir = lst_dir +# results = {"B2_A3_5.fasta-split5N.fna-short-contig.fna": True, "toto.fasta": False} +# args = ("B2_A3_5.fasta-split5N.fna-short-contig.fna", name, gpath, prok_path, +# lst_dir, prot_dir, +# gene_dir, rep_dir, gff_dir, results, my_logger()[0]) +# res = ffunc.handle_genome(args) +# assert res == (True, "B2_A3_5.fasta-split5N.fna-short-contig.fna") +# os.remove(os.path.join(lst_dir, name + ".prt")) +# os.remove(os.path.join(lst_dir, name + ".fna")) +# os.remove(os.path.join(lst_dir, name + ".gen")) +# os.remove(os.path.join(lst_dir, name + ".lst")) +# os.remove(os.path.join(lst_dir, name + ".gff")) + + +# def test_handle_genome_formaterror(): +# """ +# Test that when we try to format a genome which is in results, but with False, +# it returns a tuple with "bad_prokka" and the genome name. +# """ +# logger = my_logger() +# gpath = os.path.join("test", "data", "annotate", "genomes", +# "B2_A3_5.fasta-problems.fna-short-contig.fna") +# name = "test.0417.00002" +# prok_path = os.path.join("test", "data", "annotate", "exp_files") +# tbl_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", +# name + ".tbl") +# tblout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", +# name + ".tbl") +# shutil.copyfile(tbl_init, tblout) +# gff_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", +# name + ".gff") +# gffout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", +# name + ".gff") +# shutil.copyfile(gff_init, gffout) +# lst_dir = os.path.join("test", "data", "annotate") +# prot_dir = lst_dir +# gene_dir = lst_dir +# rep_dir = lst_dir +# gff_dir = lst_dir +# results = {"B2_A3_5.fasta-problems.fna-short-contig.fna": True, "toto.fasta": False} +# args = ("B2_A3_5.fasta-problems.fna-short-contig.fna", name, gpath, +# prok_path, lst_dir, prot_dir, gene_dir, rep_dir, gff_dir, results, logger[0]) +# res = ffunc.handle_genome(args) +# assert res == (False, "B2_A3_5.fasta-problems.fna-short-contig.fna") +# msg = ("Unknown header format >EPKOMDHM_i00002 hypothetical protein in " +# "test/data/annotate/exp_files/B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes/" +# "test.0417.00002.ffn.\n" +# "Gen file will not be created.") +# q = logger[0] +# assert q.qsize() == 1 +# assert q.get().message == msg +# # remove tblout which was copied for this test +# assert not os.path.isfile(os.path.join(lst_dir, name + ".prt")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".gen")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".fna")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".lst")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".gff")) +# os.remove(tblout) +# os.remove(gffout) + + +# def test_format1genome(): +# """ +# Test that formatting a genome (making .prt, .gen, .fna, .lst) works, with a genome +# which did not change name between prokka run and format step. +# """ +# gpath = os.path.join("test", "data", "annotate", "genomes", +# "B2_A3_5.fasta-split5N.fna-short-contig.fna") +# name = "test.0417.00002" +# prok_path = os.path.join("test", "data", "annotate", "exp_files") +# lst_dir = os.path.join("test", "data", "annotate") +# prot_dir = lst_dir +# gene_dir = lst_dir +# rep_dir = lst_dir +# gff_dir = lst_dir +# assert ffunc.format_one_genome(gpath, name, prok_path, lst_dir, prot_dir, +# gene_dir, rep_dir, gff_dir, my_logger()[1]) +# # Check that all files were created +# assert os.path.isfile(os.path.join(lst_dir, name + ".lst")) +# assert os.path.isfile(os.path.join(lst_dir, name + ".fna")) +# assert os.path.isfile(os.path.join(lst_dir, name + ".prt")) +# assert os.path.isfile(os.path.join(lst_dir, name + ".gen")) +# assert os.path.isfile(os.path.join(lst_dir, name + ".gff")) +# # Check the contents of the files +# explst = os.path.join(prok_path, "res_format-B2.lst") +# expprt = os.path.join(prok_path, "res_format-B2.prt") +# expgen = os.path.join(prok_path, "res_format-B2.gen") +# expgff = os.path.join(prok_path, "res_format-B2.gff") +# with open(explst, "r") as lstf, open(os.path.join(lst_dir, name + ".lst"), "r") as lsto: +# for line_exp, line_out in zip(lstf, lsto): +# assert line_exp == line_out +# with open(expprt, "r") as expf, open(os.path.join(lst_dir, name + ".prt"), "r") as outf: +# for line_exp, line_out in zip(expf, outf): +# assert line_exp == line_out +# with open(expgen, "r") as expf, open(os.path.join(lst_dir, name + ".gen"), "r") as outf: +# for line_exp, line_out in zip(expf, outf): +# assert line_exp == line_out +# with open(gpath, "r") as expf, open(os.path.join(lst_dir, name + ".fna"), "r") as outf: +# for line_exp, line_out in zip(expf, outf): +# assert line_exp == line_out +# with open(expgff, "r") as expf, open(os.path.join(lst_dir, name + ".gff"), "r") as outf: +# for line_exp, line_out in zip(expf, outf): +# assert line_exp == line_out +# os.remove(os.path.join(lst_dir, name + ".lst")) +# os.remove(os.path.join(lst_dir, name + ".prt")) +# os.remove(os.path.join(lst_dir, name + ".fna")) +# os.remove(os.path.join(lst_dir, name + ".gen")) +# os.remove(os.path.join(lst_dir, name + ".gff")) + + +# def test_format1genome_change_head(): +# """ +# Test that formatting a genome (making .prt, .gen, .fna, .lst) works, with a genome +# which changed its name between prokka and format step. +# """ +# ginit = os.path.join("test", "data", "annotate", "genomes", "B2_A3_5.fasta-changeName.fna") +# gpath = os.path.join("test", "data", "annotate", "genomes", +# "B2_A3_5.fasta-changeName.fna-short-contig.fna") +# shutil.copyfile(ginit, gpath) +# name = "test.0417.00002" +# prok_path = os.path.join("test", "data", "annotate", "exp_files") +# lst_dir = os.path.join("test", "data", "annotate") +# prot_dir = lst_dir +# gene_dir = lst_dir +# rep_dir = lst_dir +# gff_dir = lst_dir +# assert ffunc.format_one_genome(gpath, name, prok_path, lst_dir, +# prot_dir, gene_dir, rep_dir, gff_dir, my_logger()[1]) +# # Check that all files were created +# assert os.path.isfile(os.path.join(lst_dir, name + ".lst")) +# assert os.path.isfile(os.path.join(lst_dir, name + ".fna")) +# assert os.path.isfile(os.path.join(lst_dir, name + ".prt")) +# assert os.path.isfile(os.path.join(lst_dir, name + ".gen")) +# assert os.path.isfile(os.path.join(lst_dir, name + ".gff")) +# # Check the contents of the files +# explst = os.path.join(prok_path, "res_format-B2.lst") +# expprt = os.path.join(prok_path, "res_format-B2.prt") +# expgen = os.path.join(prok_path, "res_format-B2.gen") +# expgff = os.path.join(prok_path, "res_format-B2.gff") +# expreplicons = os.path.join("test", "data", "annotate", "genomes", +# "B2_A3_5.fasta-split5N.fna-short-contig.fna") +# with open(explst, "r") as lstf, open(os.path.join(lst_dir, name + ".lst"), "r") as lsto: +# for line_exp, line_out in zip(lstf, lsto): +# assert line_exp == line_out +# with open(expprt, "r") as expf, open(os.path.join(lst_dir, name + ".prt"), "r") as outf: +# for line_exp, line_out in zip(expf, outf): +# assert line_exp == line_out +# with open(expgen, "r") as expf, open(os.path.join(lst_dir, name + ".gen"), "r") as outf: +# for line_exp, line_out in zip(expf, outf): +# assert line_exp == line_out +# with open(expgff, "r") as expf, open(os.path.join(lst_dir, name + ".gff"), "r") as outf: +# for line_exp, line_out in zip(expf, outf): +# assert line_exp == line_out +# with open(expreplicons, "r") as expf, open(os.path.join(lst_dir, name + ".fna"), "r") as outf: +# for line_exp, line_out in zip(expf, outf): +# assert line_exp == line_out +# os.remove(os.path.join(lst_dir, name + ".lst")) +# os.remove(os.path.join(lst_dir, name + ".prt")) +# os.remove(os.path.join(lst_dir, name + ".fna")) +# os.remove(os.path.join(lst_dir, name + ".gen")) +# os.remove(os.path.join(lst_dir, name + ".gff")) +# os.remove(gpath) + + +# def test_format1genome_problemgen(): +# """ +# Test that formatting a genome (making .prt, .gen, .fna, .lst) returns an error message +# and does not create any output file if there is a problem while converting the +# .ffn to .gen +# """ +# logger = my_logger() +# gpath = os.path.join("test", "data", "annotate", "genomes", +# "B2_A3_5.fasta-problems.fna-short-contig.fna") +# name = "test.0417.00002" +# prok_path = os.path.join("test", "data", "annotate", "exp_files") +# tbl_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", +# name + ".tbl") +# tblout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", +# name + ".tbl") +# shutil.copyfile(tbl_init, tblout) +# gff_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", +# name + ".gff") +# gffout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", +# name + ".gff") +# shutil.copyfile(gff_init, gffout) +# lst_dir = os.path.join("test", "data", "annotate") +# prot_dir = lst_dir +# gene_dir = lst_dir +# rep_dir = lst_dir +# gff_dir = lst_dir +# assert not ffunc.format_one_genome(gpath, name, prok_path, lst_dir, prot_dir, +# gene_dir, rep_dir, gff_dir, logger[1]) +# # Check that all files were not created +# assert not os.path.isfile(os.path.join(lst_dir, name + ".lst")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".fna")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".prt")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".gen")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".gff")) +# msg = ("Unknown header format >EPKOMDHM_i00002 hypothetical protein in " +# "test/data/annotate/exp_files/B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes/" +# "test.0417.00002.ffn.\n" +# "Gen file will not be created.") +# q = logger[0] +# assert q.qsize() == 1 +# assert q.get().message == msg +# # remove tblout which was copied for this test +# os.remove(tblout) +# os.remove(gffout) + + +# def test_format1genome_problemprt(): +# """ +# Test that formatting a genome (making .prt, .gen, .fna, .lst) works, with a genome +# which did not change name between prokka run and format step. +# """ +# logger = my_logger() +# gpath = os.path.join("test", "data", "annotate", "genomes", +# "B2_A3_5.fasta-problems.fna-short-contig.fna") +# name = "test.0417.00002" +# prok_path = os.path.join("test", "data", "annotate", "exp_files") +# # copy tbl without errors to error prokka dir +# tbl_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", +# name + ".tbl") +# tblout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", +# name + ".tbl") +# shutil.copyfile(tbl_init, tblout) +# # copy gff without errors to error prokka dir +# gff_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", +# name + ".gff") +# gffout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", +# name + ".gff") +# shutil.copyfile(gff_init, gffout) +# # copy ffn without error to error prokka dir +# ffn_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", +# name + ".ffn") +# ffn_ok = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", +# name + ".ffn") +# ffn_error = ffn_ok + "-error" +# # change name of ffn file with error to keep it for later (used for tests) +# shutil.copyfile(ffn_ok, ffn_error) +# # copy ffn without error to prokka res (erasing ffn with error) +# shutil.copyfile(ffn_init, ffn_ok) +# lst_dir = os.path.join("test", "data", "annotate") +# prot_dir = lst_dir +# gene_dir = lst_dir +# rep_dir = lst_dir +# gff_dir = lst_dir +# assert not ffunc.format_one_genome(gpath, name, prok_path, lst_dir, prot_dir, +# gene_dir, rep_dir, gff_dir, logger[1]) +# # Check that all files were not created +# assert not os.path.isfile(os.path.join(lst_dir, name + ".lst")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".fna")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".prt")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".gen")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".gff")) +# msg = ("Unknown header format >EPKOMDHM00003 hypothetical protein in " +# "test/data/annotate/exp_files/B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes/" +# "test.0417.00002.faa. " +# "Error: invalid literal for int() with base 10: '>EPKOMDHM00003'\n" +# "Prt file not created from test/data/annotate/exp_files/" +# "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes/test.0417.00002.faa.") +# q = logger[0] +# assert q.qsize() == 1 +# assert q.get().message == msg +# # remove files which were copied for this test (tblout). And rename ffn with errors +# # to its original name. +# os.rename(ffn_error, ffn_ok) +# os.remove(tblout) +# os.remove(gffout) + + +# def test_format1genome_problemgff(): +# """ +# Test that formatting a genome (making .prt, .gen, .fna, .lst) returns an error message +# and does not create any output file if there is a problem while converting the +# .ffn to .gen +# """ +# logger = my_logger() +# gpath = os.path.join("test", "data", "annotate", "genomes", +# "B2_A3_5.fasta-problems.fna-short-contig.fna") +# name = "test.0417.00002" +# prok_path = os.path.join("test", "data", "annotate", "exp_files") +# tbl_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", +# name + ".tbl") +# tblout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", +# name + ".tbl") +# shutil.copyfile(tbl_init, tblout) +# gff_init = os.path.join("test", "data", "annotate", "test_files", "prokka_out_gff-error.gff") +# gffout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", +# name + ".gff") +# shutil.copyfile(gff_init, gffout) +# lst_dir = os.path.join("test", "data", "annotate") +# prot_dir = lst_dir +# gene_dir = lst_dir +# rep_dir = lst_dir +# gff_dir = lst_dir +# assert not ffunc.format_one_genome(gpath, name, prok_path, lst_dir, prot_dir, +# gene_dir, rep_dir, gff_dir, logger[1]) +# # Check that all files were not created +# assert not os.path.isfile(os.path.join(lst_dir, name + ".lst")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".fna")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".prt")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".gen")) +# assert not os.path.isfile(os.path.join(lst_dir, name + ".gff")) +# msg = ("Problem with your gff file. '##FASTA' is not a gff entry line, whereas it should " +# "correspond to '11249\t12328\tD\tCDS\ttest.0417.00002.i0002_00016\tNA\t" +# "| hypothetical protein | NA | NA'") +# q = logger[0] +# assert q.qsize() == 1 +# assert q.get().message == msg +# # remove tblout which was copied for this test +# os.remove(tblout) +# os.remove(gffout) + + +# def test_format_all(): +# """ +# Test that when giving a list of genomes, for which prokka ran without problem, +# they are formatted, with all expected files created. +# """ +# # genomes = {genome: [name, gpath, size, nbcont, l90]} +# initnames = ["H299_H561.fasta", "B2_A3_5.fasta-changeName.fna"] +# initpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in initnames] +# gnames = ["H299_H561.fasta-short-contig.fna", "B2_A3_5.fasta-changeName.fna-short-contig.fna"] +# onames = ["test_runprokka_H299", "test.0417.00002"] +# gpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in gnames] +# for f1, f2 in zip(initpaths, gpaths): +# shutil.copyfile(f1, f2) +# genomes = {gnames[0]: [onames[0], gpaths[0], 12656, 3, 1], +# gnames[1]: [onames[1], gpaths[1], 456464645, 5, 1] +# } +# prok_path = os.path.join("test", "data", "annotate", "exp_files") +# res_path = os.path.join("test", "data", "annotate") +# results = {gname: True for gname in gnames} +# skipped, skipped_format = ffunc.format_genomes(genomes, results, res_path, +# prok_path, threads=4) +# assert skipped == [] +# assert skipped_format == [] +# lstfiles = [os.path.join(res_path, "LSTINFO", name + ".lst") for name in onames] +# prtfiles = [os.path.join(res_path, "Proteins", name + ".prt") for name in onames] +# genfiles = [os.path.join(res_path, "Genes", name + ".gen") for name in onames] +# repfiles = [os.path.join(res_path, "Replicons", name + ".fna") for name in onames] +# gfffiles = [os.path.join(res_path, "gff3", name + ".gff") for name in onames] +# for f in lstfiles + prtfiles + genfiles + repfiles + gfffiles: +# assert os.path.isfile(f) +# shutil.rmtree(os.path.join(res_path, "LSTINFO")) +# shutil.rmtree(os.path.join(res_path, "Proteins")) +# shutil.rmtree(os.path.join(res_path, "Genes")) +# shutil.rmtree(os.path.join(res_path, "Replicons")) +# shutil.rmtree(os.path.join(res_path, "gff3")) +# for f in gpaths: +# os.remove(f) + + +# def test_format_all_result_false(): +# """ +# Test that when giving a list of 2 genomes, 1 for which prokka ran without problem, +# 1 for which prokka had problems (given with False in results), +# the correct genome is formatted, with all +# expected files created, and the genome with problems is not formatted. +# """ +# # genomes = {genome: [name, gpath, size, nbcont, l90]} +# initnames = ["H299_H561.fasta", "B2_A3_5.fasta-changeName.fna"] +# initpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in initnames] +# gnames = ["H299_H561.fasta-short-contig.fna", "B2_A3_5.fasta-changeName.fna-short-contig.fna"] +# onames = ["test_runprokka_H299", "test.0417.00002"] +# gpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in gnames] +# for f1, f2 in zip(initpaths, gpaths): +# shutil.copyfile(f1, f2) +# genomes = {gnames[0]: [onames[0], gpaths[0], 12656, 3, 1], +# gnames[1]: [onames[1], gpaths[1], 456464645, 5, 1] +# } +# prok_path = os.path.join("test", "data", "annotate", "exp_files") +# res_path = os.path.join("test", "data", "annotate") +# results = {gnames[0]: True, gnames[1]: False} +# skipped, skipped_format = ffunc.format_genomes(genomes, results, res_path, prok_path) +# assert skipped == ["B2_A3_5.fasta-changeName.fna-short-contig.fna"] +# assert skipped_format == [] +# lstfiles = os.path.join(res_path, "LSTINFO") +# prtfiles = os.path.join(res_path, "Proteins") +# genfiles = os.path.join(res_path, "Genes") +# repfiles = os.path.join(res_path, "Replicons") +# gfffiles = os.path.join(res_path, "gff3") +# assert os.path.isfile(os.path.join(lstfiles, onames[0] + ".lst")) +# assert not os.path.isfile(os.path.join(lstfiles, onames[1] + ".lst")) +# assert os.path.isfile(os.path.join(prtfiles, onames[0] + ".prt")) +# assert not os.path.isfile(os.path.join(prtfiles, onames[1] + ".prt")) +# assert os.path.isfile(os.path.join(genfiles, onames[0] + ".gen")) +# assert not os.path.isfile(os.path.join(genfiles, onames[1] + ".gen")) +# assert os.path.isfile(os.path.join(repfiles, onames[0] + ".fna")) +# assert not os.path.isfile(os.path.join(repfiles, onames[1] + ".fna")) +# assert os.path.isfile(os.path.join(gfffiles, onames[0] + ".gff")) +# assert not os.path.isfile(os.path.join(gfffiles, onames[1] + ".gff")) +# shutil.rmtree(os.path.join(res_path, "LSTINFO")) +# shutil.rmtree(os.path.join(res_path, "Proteins")) +# shutil.rmtree(os.path.join(res_path, "Genes")) +# shutil.rmtree(os.path.join(res_path, "Replicons")) +# shutil.rmtree(os.path.join(res_path, "gff3")) +# for f in gpaths: +# os.remove(f) + + +# def test_format_all_not_result(): +# """ +# Test that when giving a list of 2 genomes, but only 1 is in the results list (and prokka ran +# without problems for it), the correct genome is formatted, with all +# expected files created, and the other genome is not formatted, and does not appear in +# skipped list (as it was removed from the study before annotation step, probably by QC). +# """ +# # genomes = {genome: [name, gpath, size, nbcont, l90]} +# initnames = ["H299_H561.fasta", "B2_A3_5.fasta-changeName.fna"] +# initpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in initnames] +# gnames = ["H299_H561.fasta-short-contig.fna", "B2_A3_5.fasta-changeName.fna-short-contig.fna"] +# onames = ["test_runprokka_H299", "test.0417.00002"] +# gpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in gnames] +# for f1, f2 in zip(initpaths, gpaths): +# shutil.copyfile(f1, f2) +# genomes = {gnames[0]: [onames[0], gpaths[0], 12656, 3, 1], +# gnames[1]: [onames[1], gpaths[1], 456464645, 5, 1] +# } +# prok_path = os.path.join("test", "data", "annotate", "exp_files") +# res_path = os.path.join("test", "data", "annotate") +# results = {gnames[0]: True} +# skipped, skipped_format = ffunc.format_genomes(genomes, results, res_path, prok_path) +# assert skipped == [] +# assert skipped_format == [] +# lstfiles = os.path.join(res_path, "LSTINFO") +# prtfiles = os.path.join(res_path, "Proteins") +# genfiles = os.path.join(res_path, "Genes") +# repfiles = os.path.join(res_path, "Replicons") +# gfffiles = os.path.join(res_path, "gff3") +# assert os.path.isfile(os.path.join(lstfiles, onames[0] + ".lst")) +# assert not os.path.isfile(os.path.join(lstfiles, onames[1] + ".lst")) +# assert os.path.isfile(os.path.join(prtfiles, onames[0] + ".prt")) +# assert not os.path.isfile(os.path.join(prtfiles, onames[1] + ".prt")) +# assert os.path.isfile(os.path.join(genfiles, onames[0] + ".gen")) +# assert not os.path.isfile(os.path.join(genfiles, onames[1] + ".gen")) +# assert os.path.isfile(os.path.join(repfiles, onames[0] + ".fna")) +# assert not os.path.isfile(os.path.join(repfiles, onames[1] + ".fna")) +# assert os.path.isfile(os.path.join(gfffiles, onames[0] + ".gff")) +# assert not os.path.isfile(os.path.join(gfffiles, onames[1] + ".gff")) +# shutil.rmtree(os.path.join(res_path, "LSTINFO")) +# shutil.rmtree(os.path.join(res_path, "Proteins")) +# shutil.rmtree(os.path.join(res_path, "Genes")) +# shutil.rmtree(os.path.join(res_path, "Replicons")) +# shutil.rmtree(os.path.join(res_path, "gff3")) +# for f in gpaths: +# os.remove(f) + +# # probleme avec .fna de onames[0] qui n'est pas créé... + + +# def test_format_all_error(): +# """ +# Test that when giving a list of 2 genomes, prokka ran without problem for both. +# But a problem appears while formatting the 2nd one. So, the 2nd one is not formatted, +# and appears in skipped_format. The first one is formated, and check that all +# output files are created. +# """ +# # genomes = {genome: [name, gpath, size, nbcont, l90]} +# name = "test.0417.00002" +# initnames = ["H299_H561.fasta", "B2_A3_5.fasta-changeName.fna"] +# initpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in initnames] +# gnames = ["H299_H561.fasta-short-contig.fna", "B2_A3_5.fasta-problems.fna-short-contig.fna"] +# onames = ["test_runprokka_H299", "test.0417.00002"] +# gpaths = [os.path.join("test", "data", "annotate", "genomes", name) for name in gnames] +# for f1, f2 in zip(initpaths, gpaths): +# shutil.copyfile(f1, f2) +# genomes = {gnames[0]: [onames[0], gpaths[0], 12656, 3, 1], +# gnames[1]: [onames[1], gpaths[1], 456464645, 5, 1] +# } +# prok_path = os.path.join("test", "data", "annotate", "exp_files") +# res_path = os.path.join("test", "data", "annotate") +# tbl_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", +# name + ".tbl") +# tblout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", +# name + ".tbl") +# shutil.copyfile(tbl_init, tblout) +# gff_init = os.path.join(prok_path, "B2_A3_5.fasta-split5N.fna-short-contig.fna-prokkaRes", +# name + ".gff") +# gffout = os.path.join(prok_path, "B2_A3_5.fasta-problems.fna-short-contig.fna-prokkaRes", +# name + ".gff") +# shutil.copyfile(gff_init, gffout) +# results = {gnames[0]: True, gnames[1]: True} +# skipped, skipped_format = ffunc.format_genomes(genomes, results, res_path, prok_path) +# assert skipped == [] +# assert skipped_format == ["B2_A3_5.fasta-problems.fna-short-contig.fna"] +# lstfiles = os.path.join(res_path, "LSTINFO") +# prtfiles = os.path.join(res_path, "Proteins") +# genfiles = os.path.join(res_path, "Genes") +# repfiles = os.path.join(res_path, "Replicons") +# gfffiles = os.path.join(res_path, "gff3") +# assert os.path.isfile(os.path.join(lstfiles, onames[0] + ".lst")) +# assert not os.path.isfile(os.path.join(lstfiles, onames[1] + ".lst")) +# assert os.path.isfile(os.path.join(prtfiles, onames[0] + ".prt")) +# assert not os.path.isfile(os.path.join(prtfiles, onames[1] + ".prt")) +# assert os.path.isfile(os.path.join(genfiles, onames[0] + ".gen")) +# assert not os.path.isfile(os.path.join(genfiles, onames[1] + ".gen")) +# assert os.path.isfile(os.path.join(repfiles, onames[0] + ".fna")) +# assert not os.path.isfile(os.path.join(repfiles, onames[1] + ".fna")) +# assert os.path.isfile(os.path.join(gfffiles, onames[0] + ".gff")) +# assert not os.path.isfile(os.path.join(gfffiles, onames[1] + ".gff")) +# shutil.rmtree(os.path.join(res_path, "LSTINFO")) +# shutil.rmtree(os.path.join(res_path, "Proteins")) +# shutil.rmtree(os.path.join(res_path, "Genes")) +# shutil.rmtree(os.path.join(res_path, "Replicons")) +# shutil.rmtree(os.path.join(res_path, "gff3")) +# os.remove(tblout) +# os.remove(gffout) +# for f in gpaths: +# os.remove(f)