diff --git a/PanACoTA/annotate_module/format_prodigal.py b/PanACoTA/annotate_module/format_prodigal.py
index d1005c142086ec6e4f26210a9299c25558930431..512f32d9a60298394d5eae07181862db4bf17119 100644
--- a/PanACoTA/annotate_module/format_prodigal.py
+++ b/PanACoTA/annotate_module/format_prodigal.py
@@ -29,7 +29,7 @@ logger = logging.getLogger("annotate.prodigal_format")
def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
- rep_dir, gff_dir):
+ rep_dir, gff_dir, logger, changed_name=False):
"""
Format the given genome, and create its corresponding files in the following folders:
@@ -79,9 +79,24 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
res_rep_file = os.path.join(rep_dir, name + ".fna")
res_gff_file = os.path.join(gff_dir, name + ".gff")
+ # Generate replicon file (same as input sequence but with gembase formatted headers). From
+ # this file, get contig names, to be used to generate gff file
+ contigs, sizes = utils.get_genome_contigs_and_rename(name, gpath, res_rep_file)
+ if not contigs:
+ try:
+ os.remove(res_lst_file)
+ os.remove(res_gff_file)
+ os.remove(res_gene_file)
+ os.remove(res_prt_file)
+ os.remove(res_rep_file)
+ except OSError:
+ pass
+ logger.error("Problems while generating Replicon file for {}".format(name))
+ return False
+
# First, create .gen and .lst files. If they could not be formatted,
# remove those files, and return False with error message
- ok = create_gene_lst(gen_file, res_gene_file, res_lst_file, name, logger)
+ ok = create_gene_lst(contigs, gen_file, res_gene_file, res_lst_file, name, logger)
if not ok:
try:
os.remove(res_gen_file)
@@ -91,10 +106,8 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
logger.error(f"Problems while generating .gen and .lst files for {name}")
return False
- # Create replicon and gff files. Replicon is the same as input sequence but with
- # gembase formatted headers
- # From Replicon file, get contig names, and use them to generate gff file
- ok = create_rep_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, name)
+ # Create gff files.
+ ok = create_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, name)
# If problem while formatting the genome (rep or gff file), remove all
# already created files, and return False (genome not formatted) with error message.
if not ok:
@@ -128,7 +141,7 @@ def format_one_genome(gpath, name, prod_path, lst_dir, prot_dir, gene_dir,
return ok
-def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger):
+def create_gene_lst(contigs, gen_file, res_gen_file, res_lst_file, gpath, name, logger):
"""
Generate .gen file, from sequences contained in .ffn, but changing the
headers to match with gembase format.
@@ -136,12 +149,16 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger):
Parameters
----------
+ contigs : dict
+ {original_contig_name: gembase_contig_name}
gen_file : str
.ffn file generated by prodigal
res_gen_file : str
generated .gen file, to write in Genes directory
res_lst_file : str
generated .lst file to write in LSTINFO directory
+ gpath : str
+ path to the genome given to prodigal
name : str
gembase name of the genome to format
logger : logging.Logger
@@ -173,7 +190,6 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger):
prev_loc = "b"
# To start, the first gene is, by definition, at the border of the contig
loc = "b"
-
# Open files: .ffn prodigal to read, .gen and .lst gembase to create
with open(gen_file, "r") as ffn, open(res_gen_file, "w") as r_gen,\
open(res_lst_file, "w") as r_lst:
@@ -194,19 +210,26 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger):
# Get contig number from prodigal gene header: prodigal first part of header is:
# <original genome name contig name>_<protein number>
contig_name = "_".join(gname.strip().split("_")[:-1])
+
# If new contig:
# - previous gene was the last of its contig -> prev_loc = "b" ;
# - the current gene is the first of its contig (loc = "b")
# - we must increment the contig number
if contig_name != prev_cont_name:
- contig_num += 1
+ # Check that this contig name is in the list, and get its gembase contig number
+ if contig_name in contigs:
+ contig_num = contigs[contig_name].split(".")[-1]
+ # if not in the list, problem, return false
+ else:
+ logger.error(f"{contig_name} found in {gen_file} does not exist in {gpath}.")
+ return False
prev_loc = 'b'
loc = 'b'
# If not new contig. If prev_loc == 'b', previous gene is the first protein
# of this contig.
# Current gene will be inside the contig (except if new contig for the next gene,
# meaning only 1 gene in the contig)
- if contig_name == prev_cont_name and prev_loc == "b":
+ else:
loc = 'i'
# If it is the first gene of the genome, we do not have any "previous gene"
@@ -259,69 +282,7 @@ def create_gene_lst(gen_file, res_gen_file, res_lst_file, name, logger):
return True
-def create_rep_gff(gpath, res_rep_file, res_gff_file, res_lst_file, gff_file, name):
- """
- Create replicon file by changing the headers in the input sequence.
- While changing the headers, get their name and size to put in gff file.
-
- Then, create gff file with the gene names and contig sizes
-
- Parameters
- ----------
- gpath : str
- path to the genome sequence
- res_rep_file : str
- path to the new file, containing 'gpath' sequence, but with 'gembase_name' in headers
- res_gff_file : str
- path to the gff file that must be created in result database
- res_lst_file : str
- path to the lst file that was created in result database in the previous step
- gff_file : str
- path to gff file generated by prodigal
- name : str
- genome name to use (species.date.strain)
-
- Returns
- -------
- bool :
- True if everything went well, False otherwise
-
- """
- contigs = create_replicons(gpath, res_rep_file, name)
- if contigs:
- return create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs)
- return False
-
-
-def create_replicons(gpath, res_rep_file, name):
- """
- Create Replicons file:
-
- Read input sequence, and replace headers with the new contig headers (found while creating
- lst and gen files)
-
- Parameters
- ----------
- gpath : str
- path to the genome sequence
- res_rep_file : str
- path to the new file, containing 'gpath' sequence, but with 'gembase_named' headers
- name : str
- genome name to use (species.date.strain)
-
- Returns
- -------
- list :
- List of all contigs with their size ["contig1;size1", "contig2;size2" ...]
-
- """
- # Change headers of replicon file to put into gembase format, and save it in res_rep_file
- contigs, _ = utils.get_genome_contigs_and_rename(name, gpath, res_rep_file)
- # Return each contig name with its size.
- return contigs
-
-
-def create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs):
+def create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs, sizes):
"""
Create .gff3 file.
@@ -350,9 +311,10 @@ def create_gff(gpath, gff_file, res_gff_file, res_lst_file, contigs):
path to the gff file that must be created in result database
res-lst_file : str
path to the lst file that was created in result database in the previous step
- contigs : list of str
- list of contig names with their size. ["contig1"\t"size1", "contig2"\t"size2" ...]
-
+ contigs : dict
+ dict of contig names with their size. ["original_name": "gembase_name"]
+ sizes : dict
+ dict of contig gembase names with their sizes {"gembase_name": size}
Returns
-------
bool
diff --git a/test/data/annotate/exp_files/res_create_gene_lst_prodigal.lst b/test/data/annotate/exp_files/res_create_gene_lst_prodigal.lst
new file mode 100644
index 0000000000000000000000000000000000000000..69198e4186d33f84fc8449e853fceeace73cce2c
--- /dev/null
+++ b/test/data/annotate/exp_files/res_create_gene_lst_prodigal.lst
@@ -0,0 +1,14 @@
+287 787 D CDS test.0417.00002.0001b_00001 NA | NA | NA | i | NA
+4416 6068 D CDS test.0417.00002.0001i_00002 NA | NA | NA | i | NA
+9000 12000 C CDS test.0417.00002.0001b_00003 NA | NA | NA | i | NA
+77 1237 D CDS test.0417.00002.0002b_00004 NA | NA | NA | info | NA
+1279 2346 D CDS test.0417.00002.0002i_00005 NA | NA | NA | fefer | NA
+2419 3000 D CDS test.0417.00002.0002i_00006 NA | NA | NA | a | NA
+3500 4000 D CDS test.0417.00002.0002i_00007 NA | NA | NA | vfvfdv | NA
+4632 5000 C CDS test.0417.00002.0002b_00008 NA | NA | NA | fdf | NA
+3399 4538 D CDS test.0417.00002.0003b_00009 NA | NA | NA | fdfd | NA
+4535 7888 D CDS test.0417.00002.0004b_00010 NA | NA | NA | info | NA
+7854 9491 D CDS test.0417.00002.0007b_00011 NA | NA | NA | ffff | NA
+9525 11285 D CDS test.0417.00002.0007i_00012 NA | NA | NA | sdfdfd | NA
+11249 12328 D CDS test.0417.00002.0007b_00013 NA | NA | NA | b | NA
+
diff --git a/test/data/annotate/test_files/original_name.fna-prodigalRes/prodigal_out_for_test.faa b/test/data/annotate/test_files/original_name.fna-prodigalRes/prodigal_out_for_test.faa
new file mode 100644
index 0000000000000000000000000000000000000000..53043bbcf60aaa1b5778f049488964c2ee5d228e
--- /dev/null
+++ b/test/data/annotate/test_files/original_name.fna-prodigalRes/prodigal_out_for_test.faa
@@ -0,0 +1,33 @@
+>JGIKIPgffgIJ_00001 # 287 # 787 # 1 # i
+aacgcgcatccagcgctagagacgctctcgacgagc
+cagatctgca
+cacaacaggtcgcgctcg
+>JGIKIPgffgIJ_00005 # 4416 # 6068 # 1 # i
+cagagatgcccc
+ccgcgttt
+>JGIKIPgffgIJ_00008 # 9000 # 12000 # -1 #i
+aaccggcctcgcgcatcgcatcagcag
+>toto_00009 # 77 # 1237 # 1 # info
+acgctagagagctcgcgctaagagatc
+>toto_00010 # 1279 # 2346 # 1 # fefer
+CCGAATAGCGCGCTCAGAGAGAGAGGA
+CGATAGCTCTCGCA
+CCGCATAGC
+>toto_00011 # 2419 # 3000 # 1 # a
+AAGCTCGCTAGACNNCTCGATACNNCTCGGATAGC
+>toto_00011 # 3500 # 4000 # 1 # vfvfdv
+CAGATAGCNNCGCTAGAGGAGAGCTCTCGGAGA
+>toto_00011 # 4632 # 5000 # -1 # fdf
+CCGAGATCGCGCGCGCTCTTCTCGAGA
+>other_header_00013 # 3399 # 4538 # 1 # fdfd
+AGCTCTCGAGAGGAGCGCTCGA
+CGCTCGCGCA
+CCTATAGGAACCACCGGGG
+>my contig_00014 # 4535 # 7888 # 1 # info
+CAGGATAGCGCGCTCAGAG
+>contname_03015 # 7854 # 9491 # 1 # ffff
+AGATCCGCGCGCTATAGAGC
+>contname_03016 # 9525 # 11285 # 1 # sdfdfd
+AAGGATCTCTCGCGAGAGGA
+>contname_03017 # 11249 # 12328 # 1 # b
+CCAGGATAGCGCGCGC
diff --git a/test/test_unit/test_annotate/test_format_prodigal.py b/test/test_unit/test_annotate/test_format_prodigal.py
new file mode 100644
index 0000000000000000000000000000000000000000..43f8bb2d9216f03868e5ea65b543584c47be3017
--- /dev/null
+++ b/test/test_unit/test_annotate/test_format_prodigal.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+"""
+Unit tests for general_format_functions.py
+"""
+
+import os
+import logging
+import shutil
+from io import StringIO
+import pytest
+
+import PanACoTA.annotate_module.format_prodigal as prodigalfunc
+import PanACoTA.utils as utils
+import test.test_unit.utilities_for_tests as tutil
+
+ANNOTEDIR = os.path.join("test", "data", "annotate")
+GENOMES_DIR = os.path.join(ANNOTEDIR, "genomes")
+EXP_ANNOTE = os.path.join(ANNOTEDIR, "exp_files")
+TEST_ANNOTE = os.path.join(ANNOTEDIR, "test_files")
+GENEPATH = os.path.join(ANNOTEDIR, "generated_by_unit-tests")
+
+LOGFILE_BASE = os.path.join(GENEPATH, "logs", "logfile")
+LOGFILES = [LOGFILE_BASE + ext for ext in [".log", ".log.debug", ".log.details", ".log.err"]]
+
+@pytest.fixture(autouse=True)
+def setup_teardown_module():
+ """
+ Remove log files at the end of this test module
+
+ Before each test:
+ - init logger
+ - create directory to put generated files
+
+ After:
+ - remove all log files
+ - remove directory with generated results
+ """
+ os.mkdir(GENEPATH)
+ # utils.init_logger(LOGFILE_BASE, 0, 'test_fastme', verbose=1)
+ print("setup")
+
+ yield
+ # shutil.rmtree(GENEPATH)
+ # for f in LOGFILES:
+ # if os.path.exists(f):
+ # os.remove(f)
+ print("teardown")
+
+
+def test_create_gen_lst(caplog):
+ """
+ Check that generated gen and lst files are as expected.
+ In the test file, all genomes have names different from gembase name
+ This test file contains the following aspects:
+ - gene in D strand (start < end)
+ - gene in C strand (start > end)
+ - CDS features
+ - contigs with more than 2 genes
+ - contig with only 2 genes (both 'b' loc)
+ - contig with 1 gene ('b' loc)
+ - contig without gene (should be skipped)
+ """
+ caplog.set_level(logging.DEBUG)
+ logger = logging.getLogger("test_prodigal")
+ genfile = os.path.join(TEST_ANNOTE, "original_name.fna-prodigalRes", "prodigal_out_for_test.faa")
+ # contigs = ["test.0417.00002.0001\t50", "test.0417.00002.0002\t50", "test.0417.00002.0003\t50",
+ # "test.0417.00002.0004\t50", "test.0417.00002.0005\t50", "test.0417.00002.0006\t50",
+ # "test.0417.00002.0007\t50"]
+ contigs = {"JGIKIPgffgIJ": "test.0417.00002.0001",
+ "toto": "test.0417.00002.0002",
+ "other_header": "test.0417.00002.0003",
+ "my contig": "test.0417.00002.0004",
+ "bis": "test.0417.00002.0005",
+ "ter": "test.0417.00002.0006",
+ "contname": "test.0417.00002.0007"
+ }
+ name = "test.0417.00002"
+ res_gen_file = os.path.join(GENEPATH, "prodigal_res.gen")
+ res_lst_file = os.path.join(GENEPATH, "prodigal_res.lst")
+ gpath = "original_genome_name"
+ assert prodigalfunc.create_gene_lst(contigs, genfile, res_gen_file, res_lst_file, gpath, name, logger)
+ exp_lst = os.path.join(EXP_ANNOTE, "res_create_gene_lst_prodigal.lst")
+ assert tutil.compare_order_content(exp_lst, res_lst_file)
+
+
+def test_create_gen_lst_cont_unknown(caplog):
+ """
+ A contig name in the gen file does not exist -> error message, and all result files must be removed for this genome
+ """
+ caplog.set_level(logging.DEBUG)
+ logger = logging.getLogger("test_prodigal")
+ genfile = os.path.join(TEST_ANNOTE, "original_name.fna-prodigalRes", "prodigal_out_for_test.faa")
+ contigs = {"JGIKIPgffgIJ": "test.0417.00002.0001",
+ "toto": "test.0417.00002.0002",
+ "other_header": "test.0417.00002.0003",
+ "my_contig": "test.0417.00002.0004",
+ "bis": "test.0417.00002.0005",
+ "ter": "test.0417.00002.0006",
+ "contname": "test.0417.00002.0007"
+ }
+ name = "test.0417.00002"
+ res_gen_file = os.path.join(GENEPATH, "prodigal_res.gen")
+ res_lst_file = os.path.join(GENEPATH, "prodigal_res.lst")
+ gpath = "original_genome_name"
+ assert not prodigalfunc.create_gene_lst(contigs, genfile, res_gen_file, res_lst_file, gpath, name, logger)
+ assert ("my contig fount in test/data/annotate/test_files/original_name.fna-prodigalRes/prodigal_out_for_test.faa "
+ "does not exist in original_genome_name")
+
+
+def test_create_gen_lst(caplog):
+ """
+ Check that generated gen and lst files are as expected.
+ In the test file, all genomes have names different from gembase name
+ This test file contains the following aspects:
+ - gene in D strand (start < end)
+ - gene in C strand (start > end)
+ - CDS features
+ - contigs with more than 2 genes
+ - contig with only 2 genes (both 'b' loc)
+ - contig with 1 gene ('b' loc)
+ - contig without gene (should be skipped)
+ """
+ caplog.set_level(logging.DEBUG)
+ logger = logging.getLogger("test_prodigal")
+ genfile = os.path.join(TEST_ANNOTE, "original_name.fna-prodigalRes", "prodigal_out_for_test.gff")
+ # contigs = ["test.0417.00002.0001\t50", "test.0417.00002.0002\t50", "test.0417.00002.0003\t50",
+ # "test.0417.00002.0004\t50", "test.0417.00002.0005\t50", "test.0417.00002.0006\t50",
+ # "test.0417.00002.0007\t50"]
+ contigs = {"JGIKIPgffgIJ": "test.0417.00002.0001",
+ "toto": "test.0417.00002.0002",
+ "other_header": "test.0417.00002.0003",
+ "my contig": "test.0417.00002.0004",
+ "bis": "test.0417.00002.0005",
+ "ter": "test.0417.00002.0006",
+ "contname": "test.0417.00002.0007"
+ }
+ name = "test.0417.00002"
+ res_gen_file = os.path.join(GENEPATH, "prodigal_res.gen")
+ res_lst_file = os.path.join(GENEPATH, "prodigal_res.lst")
+ gpath = "original_genome_name"
+ assert prodigalfunc.create_gene_lst(contigs, genfile, res_gen_file, res_lst_file, gpath, name, logger)
+ exp_lst = os.path.join(EXP_ANNOTE, "res_create_gene_lst_prodigal.lst")
+ assert tutil.compare_order_content(exp_lst, res_lst_file)
+
+# # def test_tbl_to_lst_not_changed_names(caplog):
+# """
+# Check that generated lstinfo file is as expected, when the genome name is the same as
+# it already was in the genome given to prokka.
+# The test tblfile contains the following aspects:
+# - gene in D strand (start < end)
+# - gene in C strand (start > end)
+# - CDS features (some with all info = ECnumber, gene name, product etc. ;
+# some with missing info)
+# - tRNA type
+# - repeat_region type (*2)
+# - contigs with more than 2 genes
+# - contig with only 2 genes (both 'b' loc)
+# - contig with 1 gene ('b' loc)
+# - contig without gene (should be skipped)
+# """
+# caplog.set_level(logging.DEBUG)
+# logger = logging.getLogger("test_prokka")
+# tblfile = os.path.join(TEST_ANNOTE, "original_name.fna-prokkaRes", "prokka_out_for_test.tbl")
+# lstfile = os.path.join(GENEPATH, "res_test_tbl2lst.lst")
+# contigs = ["test.0417.00002.0001\t50", "test.0417.00002.0002\t50", "test.0417.00002.0003\t50",
+# "test.0417.00002.0004\t50", "test.0417.00002.0005\t50", "test.0417.00002.0006\t50",
+# "test.0417.00002.0007\t50"]
+# name = "test.0417.00002"
+# assert prokkafunc.tbl2lst(tblfile, lstfile, contigs, name, logger, changed_name=False)
+# exp_lst = os.path.join(EXP_ANNOTE, "res_tbl2lst.lst")
+# assert tutil.compare_order_content(exp_lst, lstfile)
+