Adapt tests with removal of crispr consideration

ae649ae1 · Amandine PERRIN · b972323d · ae649ae1 · ae649ae1 · ae649ae1
Commit ae649ae1 authored 4 years ago by Amandine PERRIN
--- a/PanACoTA/annotate_module/format_prodigal.py
+++ b/PanACoTA/annotate_module/format_prodigal.py
@@ -52,6 +52,7 @@ July 2019
 import os
 import shutil
+import glob
 import logging
 import PanACoTA.utils as utils
@@ -100,9 +101,9 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
    prodigal_dir = os.path.join(prod_path, os.path.basename(gpath) + "-prodigalRes")
    # Get prodigal result files
-    prot_file = os.path.join(prodigal_dir, name + ".faa")
+    prot_file = glob.glob(os.path.join(prodigal_dir, "*.faa"))[0]
-    gen_file = os.path.join(prodigal_dir, name + ".ffn")
+    gen_file = glob.glob(os.path.join(prodigal_dir, "*.ffn"))[0]
-    gff_file = os.path.join(prodigal_dir, name + ".gff")
+    gff_file = glob.glob(os.path.join(prodigal_dir, "*.gff"))[0]
    # Define names for generated gembase files
    res_prot_file = os.path.join(prot_dir, name + ".prt")
@@ -275,7 +276,7 @@ def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name):
                # If it is not the first gene of the genome, write previous gene information
                if prev_start != "":
                    # Write line in LSTINFO file, + header and sequence to the gene file
-                    _, lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA", 0,
+                    lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA",
                                               prev_loc, name, prev_cont_num, "NA", prev_info,
                                               "NA", prev_strand, prev_start, prev_end, r_lst)
                    gfunc.write_header(lstline, r_gen)
@@ -302,7 +303,7 @@ def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name):
        # Otherwise, nothing to write
        if prev_start != "":
            prev_loc = "b"
-            _, lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA", 0,
+            lstline = gfunc.write_gene("CDS", locus_num, "NA", "NA",
                                       prev_loc, name, prev_cont_num, "NA", prev_info, "NA",
                                       prev_strand, prev_start, prev_end, r_lst)
            gfunc.write_header(lstline, r_gen)

--- a/PanACoTA/annotate_module/format_prokka.py
+++ b/PanACoTA/annotate_module/format_prokka.py
@@ -238,9 +238,6 @@ def tbl2lst(tblfile, lstfile, contigs, genome, gpath):
    bool :
        True if genome name used in lstfile and prokka tblfile are the same, False otherwise
    """
-    # Number CRISPRs. By default, 0 CRISPR -> next one will be CRISPR1
-    crispr_num = 1
-    crispr = False
    # Protein localisation in contig (b = border ; i = inside)
    cont_loc = "b"
    prev_cont_loc = "b"
@@ -325,8 +322,8 @@ def tbl2lst(tblfile, lstfile, contigs, genome, gpath):
                    # If not first gene of the contig, write the previous gene to .lst file
                    # The first gene will be written while reading the 2nd gene
                    if start != "-1" and end != "-1" and not crispr:
-                        crispr_num, lstline = general.write_gene(feature_type, locus_num,
+                        lstline = general.write_gene(feature_type, locus_num,
-                                                                 gene_name, product, crispr_num,
+                                                     gene_name, product,
                                                     prev_cont_loc, genome,
                                                     prev_cont_num, ecnum, inf2,
                                                     db_xref, strand, start, end, lstf)
@@ -359,8 +356,8 @@ def tbl2lst(tblfile, lstfile, contigs, genome, gpath):
        # Write last feature
        if start != -1 and end != -1:
            prev_cont_loc = "b"
-            crispr_num, _ = general.write_gene(feature_type, locus_num, gene_name, product,
+            general.write_gene(feature_type, locus_num, gene_name, product,
-                                               crispr_num, prev_cont_loc, genome, prev_cont_num,
+                               prev_cont_loc, genome, prev_cont_num,
                               ecnum, inf2, db_xref, strand, start, end, lstf)
    return True
@@ -516,7 +513,6 @@ def create_gen(ffnseq, lstfile, genseq):
    """
    problem = False
    write = True  # Write next sequence
-    crispr_id = 1
    with open(ffnseq) as ffn, open(lstfile) as lst, open(genseq, "w") as gen:
        for line_ffn in ffn:
            # Ignore gene that we do not want to write (should be a crispr)

--- a/PanACoTA/annotate_module/general_format_functions.py
+++ b/PanACoTA/annotate_module/general_format_functions.py
@@ -65,8 +65,7 @@ from PanACoTA.annotate_module import format_prodigal as fprodigal
 main_logger = logging.getLogger("annotate.geneffunc")
-def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, quiet=False,
+def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, quiet=False):
-                   changed_name=False):
    """
    For all genomes which were annotated (by prokka or prodigal), reformat them
    in order to have, in 'res_path', the following folders:
@@ -80,7 +79,8 @@ def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, q
    Parameters
    ----------
    genomes_ok : dict
-        genomes to format (annotation was OK) -> {genome: [name, gpath, size, nbcont, l90]}
+        genomes to format (annotation was OK) ->
+        {genome: [name, gpath, to_annot, size, nbcont, l90]}
    res_path : str
        path to folder where the 4 directories must be created
    annot_path : str
@@ -91,9 +91,6 @@ def format_genomes(genomes_ok, res_path, annot_path, prodigal_only, threads=1, q
        number of threads to use to while formatting genomes
    quiet : bool
        True if nothing must be sent to stderr/stdout, False otherwise
-    changed_name : bool
-        True if contig names have been changed (cutn != 0) -> contig names end by '_num',
-        False otherwise.
    Returns
    -------
@@ -216,7 +213,7 @@ def handle_genome(args):
    return ok_format, genome
-def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc,
+def write_gene(gtype, locus_num, gene_name, product, cont_loc,
               genome, cont_num, ecnum, inf2, db_xref, strand, start, end, lstopenfile):
    """
    Write given gene to output file
@@ -231,11 +228,6 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc,
        gene name found by prokka/prodigal ("NA" if no gene name -> Always the case with Prodigal)
    product : str
        found by prokka/Prodigal, "NA" if no product (always the case for prodigal)
-    crispr_num : int
-        current crispr number. In prokka tbl, CRISPRs are not numbered, they all
-        have the same name. We name them by adding a unique number to each CRISPR. If the current
-        gene to add is a CRISPR, this number will be incremented and returned. If not, this same
-        number will be returned.
    cont_loc : str
        'i' if the gene is inside a contig, 'b' if its on the border (first or last gene
        of the contig)
@@ -260,16 +252,9 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc,
    Returns
    -------
-    tuple :
+    str :
-        Current crispr number, lstline
+        lstline
    """
-    # if last gene was a crispr
-    if gtype == "repeat_region":
-        gtype = "CRISPR"
-        locus_num = "CRISPR" + str(crispr_num)
-        gene_name = "crispr"
-        product = "crispr-array"
-        crispr_num += 1
    locus_name = "{}.{}{}_{}".format(genome, str(cont_num).zfill(4), cont_loc,
                                     str(locus_num).zfill(5))
    # If '|' character found in those fields, replace by '_' to avoid problems while parsing
@@ -279,7 +264,7 @@ def write_gene(gtype, locus_num, gene_name, product, crispr_num, cont_loc,
                                        db_xref.replace("|", "_"))
    lst_line = "\t".join([start, end, strand, gtype, locus_name, gene_name, more_info])
    lstopenfile.write(lst_line + "\n")
-    return crispr_num, lst_line
+    return lst_line
 def write_header(lstline, outfile):

--- a/test/data/annotate/exp_files/res_create_gff_prodigal.gff
+++ b/test/data/annotate/exp_files/res_create_gff_prodigal.gff
 ##gff-version  3
-##sequence-region	test.0417.00002.0001	1	14000
+##sequence-region	test.0417.00002.0001	1	84
-##sequence-region	test.0417.00002.0002	1	5000
+##sequence-region	test.0417.00002.0002	1	103
-##sequence-region	test.0417.00002.0003	1	4600
+##sequence-region	test.0417.00002.0003	1	122
-##sequence-region	test.0417.00002.0004	1	8000
+##sequence-region	test.0417.00002.0004	1    35
-##sequence-region	test.0417.00002.0005	1	1
+##sequence-region	test.0417.00002.0005	1	198
-##sequence-region	test.0417.00002.0006	1	10
+##sequence-region	test.0417.00002.0006	1	128
-##sequence-region	test.0417.00002.0007	1	15000
+##sequence-region	test.0417.00002.0007	1	85
 test.0417.00002.0001	Prodigal:2.6	CDS	287	787	.	+	0	ID=test.0417.00002.0001b_00001;inference=ab initio prediction:Prodigal:2.6;locus_tag=EPKOMDHM_00001;product=hypothetical protein
 test.0417.00002.0001	Prodigal:2.6	CDS	4416	6068	.	+	0	ID=test.0417.00002.0001i_00002;Name=yiaD;gene=yiaD;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:UniProtKB:P37665;locus_tag=EPKOMDHM_00005;product=putative lipoprotein YiaD
 test.0417.00002.0001	Prodigal:2.6	CDS	9000	12002	.	-	0	ID=test.0417.00002.0001b_00003;Name=vgrG1;gene=vgrG1;inference=ab initio prediction:Prodigal:2.6,similar to AA sequence:UniProtKB:Q9HI36;locus_tag=EPKOMDHM_00006;product=Major exported protein

--- a/test/test_unit/test_annotate/test_format_prodigal.py
+++ b/test/test_unit/test_annotate/test_format_prodigal.py
@@ -129,13 +129,13 @@ def test_create_gff(caplog):
               "ter": "test.0417.00002.0006",
               "contname": "test.0417.00002.0007"
               }
-    sizes = {"test.0417.00002.0001": 14000,
+    sizes = {"test.0417.00002.0001": 84,
-             "test.0417.00002.0002": 5000,
+             "test.0417.00002.0002": 103,
-             "test.0417.00002.0003": 4600,
+             "test.0417.00002.0003": 122,
-             "test.0417.00002.0004": 8000,
+             "test.0417.00002.0004": 35,
-             "test.0417.00002.0005": 1,
+             "test.0417.00002.0005": 198,
-             "test.0417.00002.0006": 10,
+             "test.0417.00002.0006": 128,
-             "test.0417.00002.0007": 15000,
+             "test.0417.00002.0007": 85,
            }
    res_gff_file = os.path.join(GENEPATH, "prodigal_res.gff")
    exp_lst = os.path.join(EXP_ANNOTE, "res_create_gene_lst_prodigal.lst")
@@ -383,7 +383,15 @@ def test_format_1genome_emptygpath(caplog):
    # Create empty file, that we give to prodigal for formatting step
    gpath =  os.path.join(GENEPATH, "original_name-empty.fna")
    open(gpath, "w").close()
-    prod_path = TEST_ANNOTE
+    prod_path = GENEPATH
+    # Create prodigal result files (empty, then won't be read)
+    prodigal_dir = gpath + "-prodigalRes"
+    os.makedirs(prodigal_dir)
+    prodigal_faa = os.path.join(gpath + "-prodigalRes", "notread.faa")
+    prodigal_ffn = os.path.join(gpath + "-prodigalRes", "notread.ffn")
+    prodigal_gff = os.path.join(gpath + "-prodigalRes", "notread.gff")
+    for file in [prodigal_faa, prodigal_gff, prodigal_ffn]:
+        open(file, "w").close()
    # Generate result folders
    prot_dir = os.path.join(GENEPATH, "Proteins")
    lst_dir = os.path.join(GENEPATH, "LSTINFO")
@@ -428,12 +436,14 @@ def test_format_1genome_wrongffn(caplog):
    name = "prodigal.outtest.ok"
    # path to original genome, given to prodigal for annotation
    orig_gpath =  os.path.join(TEST_ANNOTE, "original_name.fna")
-    # In GENEPATH folder, create the original genome given to prodigal
+    orig_prodpath = orig_gpath + "-prodigalRes"
-    # (copy from test_file)
    used_gpath = os.path.join(GENEPATH, "original_name.fna")
    used_respath = used_gpath + "-prodigalRes"
-    os.makedirs(used_respath)
+    # Add original genome, and prodigal results to result folder
    shutil.copyfile(orig_gpath, used_gpath)
+    shutil.copytree(orig_prodpath, used_respath)
+    # In GENEPATH folder, create the original genome given to prodigal
+    # (copy from test_file)
    # Create gen_file with a header not existing
    with open(os.path.join(used_respath, "prodigal.outtest.ok.ffn"), "w") as ori:
        ori.write(">wrongheader # 1 # 2 # 1 # toto")
@@ -481,16 +491,14 @@ def test_format_1genome_wronggff(caplog):
    # path to original genome, given to prodigal for annotation
    orig_gpath =  os.path.join(TEST_ANNOTE, "original_name.fna")
+    orig_prodpath = orig_gpath + "-prodigalRes"
    # In generated_by_tests folder, create the original genome given to prodigal
    # (copy from test_file)
    used_gpath = os.path.join(GENEPATH, "original_name.fna")
    used_respath = used_gpath + "-prodigalRes"
-    os.makedirs(used_respath)
+    # Add original genome, and prodigal results to result folder
    shutil.copyfile(orig_gpath, used_gpath)
-    # Copy ffn file generated by prodigal:
+    shutil.copytree(orig_prodpath, used_respath)
-    orig_ffn = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.ffn")
-    used_ffn = os.path.join(used_respath, "prodigal.outtest.ok.ffn")
-    shutil.copyfile(orig_ffn, used_ffn)
    # Copy gff file, but modify to get wrong start position
    orig_gff = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.gff")
    used_gff = os.path.join(used_respath, "prodigal.outtest.ok.gff")
@@ -548,21 +556,14 @@ def test_format_1genome_wrongprt(caplog):
    # path to original genome, given to prodigal for annotation
    orig_gpath =  os.path.join(TEST_ANNOTE, "original_name.fna")
+    orig_prodpath = orig_gpath + "-prodigalRes"
    # In generated_by_tests folder, create the original genome given to prodigal
    # (copy from test_file)
    used_gpath = os.path.join(GENEPATH, "original_name.fna")
    used_respath = used_gpath + "-prodigalRes"
-    os.makedirs(used_respath)
+    # Add original genome, and prodigal results to result folder
    shutil.copyfile(orig_gpath, used_gpath)
-    # Copy ffn file generated by prodigal:
+    shutil.copytree(orig_prodpath, used_respath)
-    orig_ffn = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.ffn")
-    used_ffn = os.path.join(used_respath, "prodigal.outtest.ok.ffn")
-    shutil.copyfile(orig_ffn, used_ffn)
-    # Copy gff file generated by prodigal:
-    orig_gff = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.gff")
-    used_gff = os.path.join(used_respath, "prodigal.outtest.ok.gff")
-    shutil.copyfile(orig_gff, used_gff)
-    # Copy prt file, but removing first proteins
    orig_faa = os.path.join(orig_gpath + "-prodigalRes", "prodigal.outtest.ok.faa")
    used_faa = os.path.join(used_respath, "prodigal.outtest.ok.faa")
    with open(orig_faa, "r") as faa, open(used_faa, "w") as faar: